Apply .clang-format to all repo source files

Change-Id: I7e79c6058f0303f9a98911e3b7dd2e8596079344 [ROCm/clr commit: 9e47fccc89]
2018-03-12 11:29:03 +05:30
parent ecbb701440
commit 46ddefedee
293 changed files with 43980 additions and 45830 deletions
@@ -2,39 +2,27 @@

 cl::OptionCategory ToolTemplateCategory("CUDA to HIP source translator options");

-cl::opt<std::string> OutputFilename("o",
-  cl::desc("Output filename"),
-  cl::value_desc("filename"),
-  cl::cat(ToolTemplateCategory));
+cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename"),
+                                    cl::cat(ToolTemplateCategory));

 cl::opt<bool> Inplace("inplace",
-  cl::desc("Modify input file inplace, replacing input with hipified output, save backup in .prehip file"),
-  cl::value_desc("inplace"),
-  cl::cat(ToolTemplateCategory));
+                      cl::desc("Modify input file inplace, replacing input with hipified output, "
+                               "save backup in .prehip file"),
+                      cl::value_desc("inplace"), cl::cat(ToolTemplateCategory));

-cl::opt<bool> NoBackup("no-backup",
-  cl::desc("Don't create a backup file for the hipified source"),
-  cl::value_desc("no-backup"),
-  cl::cat(ToolTemplateCategory));
+cl::opt<bool> NoBackup("no-backup", cl::desc("Don't create a backup file for the hipified source"),
+                       cl::value_desc("no-backup"), cl::cat(ToolTemplateCategory));

-cl::opt<bool> NoOutput("no-output",
-  cl::desc("Don't write any translated output to stdout"),
-  cl::value_desc("no-output"),
-  cl::cat(ToolTemplateCategory));
+cl::opt<bool> NoOutput("no-output", cl::desc("Don't write any translated output to stdout"),
+                       cl::value_desc("no-output"), cl::cat(ToolTemplateCategory));

-cl::opt<bool> PrintStats("print-stats",
-  cl::desc("Print translation statistics"),
-  cl::value_desc("print-stats"),
-  cl::cat(ToolTemplateCategory));
+cl::opt<bool> PrintStats("print-stats", cl::desc("Print translation statistics"),
+                         cl::value_desc("print-stats"), cl::cat(ToolTemplateCategory));

-cl::opt<std::string> OutputStatsFilename("o-stats",
-  cl::desc("Output filename for statistics"),
-  cl::value_desc("filename"),
-  cl::cat(ToolTemplateCategory));
+cl::opt<std::string> OutputStatsFilename("o-stats", cl::desc("Output filename for statistics"),
+                                         cl::value_desc("filename"), cl::cat(ToolTemplateCategory));

-cl::opt<bool> Examine("examine",
-  cl::desc("Combines -no-output and -print-stats options"),
-  cl::value_desc("examine"),
-  cl::cat(ToolTemplateCategory));
+cl::opt<bool> Examine("examine", cl::desc("Combines -no-output and -print-stats options"),
+                      cl::value_desc("examine"), cl::cat(ToolTemplateCategory));

 cl::extrahelp CommonHelp(ct::CommonOptionsParser::HelpMessage);
@@ -25,7 +25,8 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
        const auto found = CUDA_RENAMES_MAP().find(name);
        if (found != CUDA_RENAMES_MAP().end()) {
            StringRef repName = found->second.hipName;
-            hipCounter counter = {"[string literal]", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.unsupported};
+            hipCounter counter = {"[string literal]", ConvTypes::CONV_LITERAL,
+                                  ApiTypes::API_RUNTIME, found->second.unsupported};
            Statistics::current().incrementCounter(counter, name.str());

            if (!counter.unsupported) {
@@ -77,7 +78,8 @@ void HipifyAction::RewriteToken(const clang::Token& t) {
    if (found->second.unsupported) {
        // An unsupported identifier? Curses! Warn the user.
        clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
-        const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier unsupported in hip");
+        const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning,
+                                           "CUDA identifier unsupported in hip");
        DE.Report(sl, ID);
        return;
    }
@@ -94,8 +96,10 @@ clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRan
    clang::SourceLocation begin = exprRange.getBegin();
    clang::SourceLocation end = exprRange.getEnd();

-    bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{});
-    bool endSafe = !SM.isMacroBodyExpansion(end) || clang::Lexer::isAtEndOfMacroExpansion(end, SM, clang::LangOptions{});
+    bool beginSafe = !SM.isMacroBodyExpansion(begin) ||
+                     clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{});
+    bool endSafe = !SM.isMacroBodyExpansion(end) ||
+                   clang::Lexer::isAtEndOfMacroExpansion(end, SM, clang::LangOptions{});

    if (beginSafe && endSafe) {
        return {SM.getFileLoc(begin), SM.getFileLoc(end)};
@@ -120,7 +124,9 @@ clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRa


 StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
-    return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr);
+    return clang::Lexer::getSourceText(
+        clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM,
+        clang::LangOptions(), nullptr);
 }

 /**
@@ -135,28 +141,36 @@ std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Exp
    }
 }

-} // anonymous namespace
+}  // anonymous namespace

-bool HipifyAction::Exclude(const hipCounter & hipToken) {
+bool HipifyAction::Exclude(const hipCounter& hipToken) {
    switch (hipToken.type) {
        case CONV_INCLUDE_CUDA_MAIN_H:
            switch (hipToken.apiType) {
                case API_DRIVER:
                case API_RUNTIME:
-                    if (insertedRuntimeHeader) { return true; }
+                    if (insertedRuntimeHeader) {
+                        return true;
+                    }
                    insertedRuntimeHeader = true;
                    return false;
                case API_BLAS:
-                  if (insertedBLASHeader) { return true; }
+                    if (insertedBLASHeader) {
+                        return true;
+                    }
                    insertedBLASHeader = true;
                    return false;
                case API_RAND:
                    if (hipToken.hipName == "hiprand_kernel.h") {
-                        if (insertedRAND_kernelHeader) { return true; }
+                        if (insertedRAND_kernelHeader) {
+                            return true;
+                        }
                        insertedRAND_kernelHeader = true;
                        return false;
                    } else if (hipToken.hipName == "hiprand.h") {
-                        if (insertedRANDHeader) { return true; }
+                        if (insertedRANDHeader) {
+                            return true;
+                        }
                        insertedRANDHeader = true;
                        return false;
                    }
@@ -167,7 +181,9 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) {
        case CONV_INCLUDE:
            switch (hipToken.apiType) {
                case API_RAND:
-                    if (insertedRAND_kernelHeader) { return true; }
+                    if (insertedRAND_kernelHeader) {
+                        return true;
+                    }
                    insertedRAND_kernelHeader = true;
                    return false;
                default:
@@ -180,13 +196,11 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) {
    return false;
 }

-void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
-                                      const clang::Token&,
-                                      StringRef file_name,
-                                      bool is_angled,
+void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc, const clang::Token&,
+                                      StringRef file_name, bool is_angled,
                                      clang::CharSourceRange filename_range,
-                                      const clang::FileEntry*, StringRef,
-                                      StringRef, const clang::Module*) {
+                                      const clang::FileEntry*, StringRef, StringRef,
+                                      const clang::Module*) {
    clang::SourceManager& SM = getCompilerInstance().getSourceManager();
    if (!SM.isWrittenInMainFile(hash_loc)) {
        return;
@@ -208,7 +222,8 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
    clang::SourceLocation sl = filename_range.getBegin();
    if (found->second.unsupported) {
        clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
-        DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header"));
+        DE.Report(sl,
+                  DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header"));
        return;
    }

@@ -220,19 +235,22 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
        if (is_angled) {
            newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer);
        } else {
-            newInclude = llvm::Twine("\"" + found->second.hipName + "\"").toStringRef(includeBuffer);
+            newInclude =
+                llvm::Twine("\"" + found->second.hipName + "\"").toStringRef(includeBuffer);
        }
    } else {
-        // hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'.
+        // hash_loc is the location of the '#', so the whole include directive (starting at the
+        // '#') is replaced by the empty newInclude.
        sl = hash_loc;
    }
-    const char *B = SM.getCharacterData(sl);
-    const char *E = SM.getCharacterData(filename_range.getEnd());
+    const char* B = SM.getCharacterData(sl);
+    const char* E = SM.getCharacterData(filename_range.getEnd());
    ct::Replacement Rep(SM, sl, E - B, newInclude);
    insertReplacement(Rep, clang::FullSourceLoc{sl, SM});
 }

-void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) {
+void HipifyAction::PragmaDirective(clang::SourceLocation Loc,
+                                   clang::PragmaIntroducerKind Introducer) {
    if (pragmaOnce) {
        return;
    }
@@ -266,7 +284,8 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc
    const clang::Expr& calleeExpr = *(launchKernel->getCallee());
    OS << "hipLaunchKernelGGL(" << readSourceText(*SM, calleeExpr.getSourceRange()) << ", ";

-    // Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
+    // Next up are the four kernel configuration parameters, the last two of which are optional and
+    // default to zero.
    const clang::CallExpr& config = *(launchKernel->getConfig());

    // Copy the two dimensional arguments verbatim.
@@ -293,11 +312,14 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc

    OS << ")";

-    clang::SourceRange replacementRange = getWriteRange(*SM, {launchKernel->getLocStart(), launchKernel->getLocEnd()});
+    clang::SourceRange replacementRange =
+        getWriteRange(*SM, {launchKernel->getLocStart(), launchKernel->getLocEnd()});
    clang::SourceLocation launchStart = replacementRange.getBegin();
    clang::SourceLocation launchEnd = replacementRange.getEnd();

-    size_t length = SM->getCharacterData(clang::Lexer::getLocForEndOfToken(launchEnd, 0, *SM, DefaultLangOptions)) - SM->getCharacterData(launchStart);
+    size_t length = SM->getCharacterData(
+                        clang::Lexer::getLocForEndOfToken(launchEnd, 0, *SM, DefaultLangOptions)) -
+                    SM->getCharacterData(launchStart);

    ct::Replacement Rep(*SM, launchStart, length, OS.str());
    clang::FullSourceLoc fullSL(launchStart, *SM);
@@ -308,7 +330,8 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc
    return true;
 }

-bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
+bool HipifyAction::cudaSharedIncompleteArrayVar(
+    const clang::ast_matchers::MatchFinder::MatchResult& Result) {
    StringRef refName = "cudaSharedIncompleteArrayVar";
    auto* sharedVar = Result.Nodes.getNodeAs<clang::VarDecl>(refName);
    if (!sharedVar) {
@@ -356,7 +379,8 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match
    return true;
 }

-void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL) {
+void HipifyAction::insertReplacement(const ct::Replacement& rep,
+                                     const clang::FullSourceLoc& fullSL) {
    llcompat::insertReplacement(*replacements, rep);
    if (PrintStats) {
        rep.getLength();
@@ -365,22 +389,19 @@ void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::Fu
    }
 }

-std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI, llvm::StringRef) {
+std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI,
+                                                                    llvm::StringRef) {
    Finder.reset(new clang::ast_matchers::MatchFinder);

    // Replace the <<<...>>> language extension with a hip kernel launch
-    Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this);
-
    Finder->addMatcher(
-        mat::varDecl(
-            mat::isExpansionInMainFile(),
-            mat::allOf(
-                mat::hasAttr(clang::attr::CUDAShared),
-                mat::hasType(mat::incompleteArrayType())
-            )
-        ).bind("cudaSharedIncompleteArrayVar"),
-        this
-    );
+        mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this);
+
+    Finder->addMatcher(mat::varDecl(mat::isExpansionInMainFile(),
+                                    mat::allOf(mat::hasAttr(clang::attr::CUDAShared),
+                                               mat::hasType(mat::incompleteArrayType())))
+                           .bind("cudaSharedIncompleteArrayVar"),
+                       this);

    // Ownership is transferred to the caller...
    return Finder->newASTConsumer();
@@ -389,9 +410,9 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
 void HipifyAction::EndSourceFileAction() {
    // Insert the hip header, if we didn't already do it by accident during substitution.
    if (!insertedRuntimeHeader) {
-        // It's not sufficient to just replace CUDA headers with hip ones, because numerous CUDA headers are
-        // implicitly included by the compiler. Instead, we _delete_ CUDA headers, and unconditionally insert
-        // one copy of the hip include into every file.
+        // It's not sufficient to just replace CUDA headers with hip ones, because numerous CUDA
+        // headers are implicitly included by the compiler. Instead, we _delete_ CUDA headers, and
+        // unconditionally insert one copy of the hip include into every file.
        clang::SourceManager& SM = getCompilerInstance().getSourceManager();
        clang::SourceLocation sl;
        if (pragmaOnce) {
@@ -418,22 +439,25 @@ namespace {
 class PPCallbackProxy : public clang::PPCallbacks {
    HipifyAction& hipifyAction;

-public:
-    explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {}
+   public:
+    explicit PPCallbackProxy(HipifyAction& action) : hipifyAction(action) {}

    void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token,
-                            StringRef file_name, bool is_angled, clang::CharSourceRange filename_range,
-                            const clang::FileEntry* file, StringRef search_path, StringRef relative_path,
+                            StringRef file_name, bool is_angled,
+                            clang::CharSourceRange filename_range, const clang::FileEntry* file,
+                            StringRef search_path, StringRef relative_path,
                            const clang::Module* imported) override {
-        hipifyAction.InclusionDirective(hash_loc, include_token, file_name, is_angled, filename_range, file, search_path, relative_path, imported);
+        hipifyAction.InclusionDirective(hash_loc, include_token, file_name, is_angled,
+                                        filename_range, file, search_path, relative_path, imported);
    }

-    void PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) override {
+    void PragmaDirective(clang::SourceLocation Loc,
+                         clang::PragmaIntroducerKind Introducer) override {
        hipifyAction.PragmaDirective(Loc, Introducer);
    }
 };

-}
+}  // namespace

 void HipifyAction::ExecuteAction() {
    clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
@@ -444,10 +468,10 @@ void HipifyAction::ExecuteAction() {
    clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts());
    RawLex.SetKeepWhitespaceMode(true);

-    // Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough
-    // information to tell the difference between identifiers, string literals, and "other stuff". It also
-    // ignores preprocessor directives, so this transformation will operate inside preprocessor-deleted
-    // code.
+    // Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us
+    // enough information to tell the difference between identifiers, string literals, and "other
+    // stuff". It also ignores preprocessor directives, so this transformation will operate inside
+    // preprocessor-deleted code.
    clang::Token RawTok;
    RawLex.LexFromRawLexer(RawTok);
    while (RawTok.isNot(clang::tok::eof)) {
@@ -15,14 +15,15 @@ namespace ct = clang::tooling;
 */
 class HipifyAction : public clang::ASTFrontendAction,
                     public clang::ast_matchers::MatchFinder::MatchCallback {
-private:
+   private:
    ct::Replacements* replacements;
    std::unique_ptr<clang::ast_matchers::MatchFinder> Finder;

-    /// CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent
-    // ones, and track - using this flag - if the result led to us including the hip runtime header. If it did
-    // not, we insert it at the top of the file when we finish processing it.
-    // This approach means we do the best it's possible to do w.r.t preserving the user's include order.
+    /// CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with
+    /// equivalent ones, and track - using this flag - if the result led to us including the hip
+    /// runtime header. If it did not, we insert it at the top of the file when we finish
+    /// processing it. This approach means we do the best it's possible to do w.r.t preserving
+    /// the user's include order.
    bool insertedRuntimeHeader = false;
    bool insertedBLASHeader = false;
    bool insertedRANDHeader = false;
@@ -40,12 +41,11 @@ private:
    /**
     * Replace a CUDA identifier with the corresponding hip identifier, if applicable.
     */
-    void RewriteToken(const clang::Token &t);
+    void RewriteToken(const clang::Token& t);

-public:
-    explicit HipifyAction(ct::Replacements *replacements):
-        clang::ASTFrontendAction(),
-        replacements(replacements) {}
+   public:
+    explicit HipifyAction(ct::Replacements* replacements)
+        : clang::ASTFrontendAction(), replacements(replacements) {}

    // MatchCallback listeners
    bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result);
@@ -55,24 +55,21 @@ public:
    /**
     * Called by the preprocessor for each include directive during the non-raw lexing pass.
     */
-    void InclusionDirective(clang::SourceLocation hash_loc,
-                            const clang::Token &include_token,
-                            StringRef file_name,
-                            bool is_angled,
-                            clang::CharSourceRange filename_range,
-                            const clang::FileEntry *file,
-                            StringRef search_path,
-                            StringRef relative_path,
-                            const clang::Module *imported);
+    void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token,
+                            StringRef file_name, bool is_angled,
+                            clang::CharSourceRange filename_range, const clang::FileEntry* file,
+                            StringRef search_path, StringRef relative_path,
+                            const clang::Module* imported);

    /**
-    * Called by the preprocessor for each pragma directive during the non-raw lexing pass.
-    */
+     * Called by the preprocessor for each pragma directive during the non-raw lexing pass.
+     */
    void PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer);

-protected:
+   protected:
    /**
-     * Add a Replacement for the current file. These will all be applied after executing the FrontendAction.
+     * Add a Replacement for the current file. These will all be applied after executing the
+     * FrontendAction.
     */
    void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL);

@@ -87,12 +84,13 @@ protected:
    void EndSourceFileAction() override;

    /**
-     * MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an
-     * interest for.
+     * MatchCallback API entry point. Called by the AST visitor while searching the AST for things
+     * we registered an interest in.
     */
    void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override;

-    std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override;
+    std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance& CI,
+                                                          llvm::StringRef InFile) override;

-    bool Exclude(const hipCounter & hipToken);
+    bool Exclude(const hipCounter& hipToken);
 };
@@ -32,7 +32,8 @@ void insertReplacement(ct::Replacements& replacements, const ct::Replacement& re
 #endif
 }

-void EnterPreprocessorTokenStream(clang::Preprocessor& _pp, const clang::Token *start, size_t len, bool DisableMacroExpansion) {
+void EnterPreprocessorTokenStream(clang::Preprocessor& _pp, const clang::Token* start, size_t len,
+                                  bool DisableMacroExpansion) {
 #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR == 8)
    _pp.EnterTokenStream(start, len, false, DisableMacroExpansion);
 #else
@@ -40,4 +41,4 @@ void EnterPreprocessorTokenStream(clang::Preprocessor& _pp, const clang::Token *
 #endif
 }

-} // namespace llcompat
+}  // namespace llcompat
@@ -18,9 +18,9 @@ namespace llcompat {
 * remain unchanged, so let's be slightly ugly about it here. :D
 */
 #if LLVM_VERSION_MAJOR > 4
-    #define GET_NUM_ARGS() getNumParams()
+#define GET_NUM_ARGS() getNumParams()
 #else
-    #define GET_NUM_ARGS() getNumArgs()
+#define GET_NUM_ARGS() getNumArgs()
 #endif

 void PrintStackTraceOnErrorSignal();
@@ -41,9 +41,7 @@ void insertReplacement(ct::Replacements& replacements, const ct::Replacement& re
 /**
 * Version-agnostic version of Preprocessor::EnterTokenStream().
 */
-void EnterPreprocessorTokenStream(clang::Preprocessor& _pp,
-                                  const clang::Token *start,
-                                  size_t len,
+void EnterPreprocessorTokenStream(clang::Preprocessor& _pp, const clang::Token* start, size_t len,
                                  bool DisableMacroExpansion);

-} // namespace llcompat
+}  // namespace llcompat
@@ -17,12 +17,9 @@ template <typename T>
 class ReplacementsFrontendActionFactory : public ct::FrontendActionFactory {
    ct::Replacements* replacements;

-public:
-    explicit ReplacementsFrontendActionFactory(ct::Replacements* r):
-        ct::FrontendActionFactory(),
-        replacements(r) {}
+   public:
+    explicit ReplacementsFrontendActionFactory(ct::Replacements* r)
+        : ct::FrontendActionFactory(), replacements(r) {}

-    clang::FrontendAction* create() override {
-        return new T(replacements);
-    }
+    clang::FrontendAction* create() override { return new T(replacements); }
 };
@@ -4,26 +4,26 @@
 #include <iomanip>


-const char *counterNames[NUM_CONV_TYPES] = {
-    "version", "init", "device", "mem", "kern", "coord_func", "math_func", "device_func",
-    "special_func", "stream", "event", "occupancy", "ctx", "peer", "module",
-    "cache", "exec", "err", "def", "tex", "gl", "graphics",
-    "surface", "jit", "d3d9", "d3d10", "d3d11", "vdpau", "egl",
-    "thread", "other", "include", "include_cuda_main_header", "type", "literal",
-    "numeric_literal"
-};
+const char* counterNames[NUM_CONV_TYPES] = {"version",   "init",        "device",
+                                            "mem",       "kern",        "coord_func",
+                                            "math_func", "device_func", "special_func",
+                                            "stream",    "event",       "occupancy",
+                                            "ctx",       "peer",        "module",
+                                            "cache",     "exec",        "err",
+                                            "def",       "tex",         "gl",
+                                            "graphics",  "surface",     "jit",
+                                            "d3d9",      "d3d10",       "d3d11",
+                                            "vdpau",     "egl",         "thread",
+                                            "other",     "include",     "include_cuda_main_header",
+                                            "type",      "literal",     "numeric_literal"};

-const char *apiNames[NUM_API_TYPES] = {
-    "CUDA Driver API", "CUDA RT API", "CUBLAS API", "CURAND API"
-};
+const char* apiNames[NUM_API_TYPES] = {"CUDA Driver API", "CUDA RT API", "CUBLAS API",
+                                       "CURAND API"};

 namespace {

-template<typename ST, typename ST2>
-void conditionalPrint(ST *stream1,
-                      ST2* stream2,
-                      const std::string& s1,
-                      const std::string& s2) {
+template <typename ST, typename ST2>
+void conditionalPrint(ST* stream1, ST2* stream2, const std::string& s1, const std::string& s2) {
    if (stream1) {
        *stream1 << s1;
    }
@@ -37,8 +37,8 @@ void conditionalPrint(ST *stream1,
 /**
 * Print a named stat value to both the terminal and the CSV file.
 */
-template<typename T>
-void printStat(std::ostream *csv, llvm::raw_ostream* printOut, const std::string &name, T value) {
+template <typename T>
+void printStat(std::ostream* csv, llvm::raw_ostream* printOut, const std::string& name, T value) {
    if (printOut) {
        *printOut << "  " << name << ": " << value << "\n";
    }
@@ -49,12 +49,12 @@ void printStat(std::ostream *csv, llvm::raw_ostream* printOut, const std::string
 }


-} // Anonymous namespace
+}  // Anonymous namespace

 void StatCounter::incrementCounter(const hipCounter& counter, std::string name) {
    counters[name]++;
-    apiCounters[(int) counter.apiType]++;
-    convTypeCounters[(int) counter.type]++;
+    apiCounters[(int)counter.apiType]++;
+    convTypeCounters[(int)counter.type]++;
 }

 void StatCounter::add(const StatCounter& other) {
@@ -81,32 +81,36 @@ int StatCounter::getConvSum() {
 }

 void StatCounter::print(std::ostream* csv, llvm::raw_ostream* printOut, std::string prefix) {
-    conditionalPrint(csv, printOut, "\nCUDA ref type;Count\n", "[HIPIFY] info: " + prefix + " refs by type:\n");
+    conditionalPrint(csv, printOut, "\nCUDA ref type;Count\n",
+                     "[HIPIFY] info: " + prefix + " refs by type:\n");
    for (int i = 0; i < NUM_CONV_TYPES; i++) {
        if (convTypeCounters[i] > 0) {
            printStat(csv, printOut, counterNames[i], convTypeCounters[i]);
        }
    }

-    conditionalPrint(csv, printOut, "\nCUDA API;Count\n", "[HIPIFY] info: " + prefix + " refs by API:\n");
+    conditionalPrint(csv, printOut, "\nCUDA API;Count\n",
+                     "[HIPIFY] info: " + prefix + " refs by API:\n");
    for (int i = 0; i < NUM_API_TYPES; i++) {
        printStat(csv, printOut, apiNames[i], apiCounters[i]);
    }

-    conditionalPrint(csv, printOut, "\nCUDA ref name;Count\n", "[HIPIFY] info: " + prefix + " refs by names:\n");
-    for (const auto &it : counters) {
+    conditionalPrint(csv, printOut, "\nCUDA ref name;Count\n",
+                     "[HIPIFY] info: " + prefix + " refs by names:\n");
+    for (const auto& it : counters) {
        printStat(csv, printOut, it.first, it.second);
    }
 }


-Statistics::Statistics(std::string name): fileName(name) {
+Statistics::Statistics(std::string name) : fileName(name) {
    // Compute the total bytes/lines in the input file.
    std::ifstream src_file(name, std::ios::binary | std::ios::ate);
    src_file.clear();
    src_file.seekg(0);
-    totalLines = (int) std::count(std::istreambuf_iterator<char>(src_file), std::istreambuf_iterator<char>(), '\n');
-    totalBytes = (int) src_file.tellg();
+    totalLines = (int)std::count(std::istreambuf_iterator<char>(src_file),
+                                 std::istreambuf_iterator<char>(), '\n');
+    totalBytes = (int)src_file.tellg();

    // Mark the start time...
    startTime = chr::steady_clock::now();
@@ -115,7 +119,7 @@ Statistics::Statistics(std::string name): fileName(name) {

 ///////// Counter update routines //////////

-void Statistics::incrementCounter(const hipCounter &counter, std::string name) {
+void Statistics::incrementCounter(const hipCounter& counter, std::string name) {
    if (counter.unsupported) {
        unsupported.incrementCounter(counter, name);
    } else {
@@ -123,7 +127,7 @@ void Statistics::incrementCounter(const hipCounter &counter, std::string name) {
    }
 }

-void Statistics::add(const Statistics &other) {
+void Statistics::add(const Statistics& other) {
    supported.add(other.supported);
    unsupported.add(other.unsupported);
    totalBytes += other.totalBytes;
@@ -131,61 +135,60 @@ void Statistics::add(const Statistics &other) {
    touchedBytes += other.touchedBytes;
 }

-void Statistics::lineTouched(int lineNumber) {
-    touchedLines.insert(lineNumber);
-}
-void Statistics::bytesChanged(int bytes) {
-    touchedBytes += bytes;
-}
-void Statistics::markCompletion() {
-    completionTime = chr::steady_clock::now();
-}
+void Statistics::lineTouched(int lineNumber) { touchedLines.insert(lineNumber); }
+void Statistics::bytesChanged(int bytes) { touchedBytes += bytes; }
+void Statistics::markCompletion() { completionTime = chr::steady_clock::now(); }


 ///////// Output functions //////////

 void Statistics::print(std::ostream* csv, llvm::raw_ostream* printOut, bool skipHeader) {
-   if (!skipHeader) {
-       std::string str = "file \'" + fileName + "\' statistics:\n";
-       conditionalPrint(csv, printOut, "\n" + str, "\n[HIPIFY] info: " + str);
-   }
+    if (!skipHeader) {
+        std::string str = "file \'" + fileName + "\' statistics:\n";
+        conditionalPrint(csv, printOut, "\n" + str, "\n[HIPIFY] info: " + str);
+    }

-   size_t changedLines = touchedLines.size();
+    size_t changedLines = touchedLines.size();

-   // Total number of (un)supported refs that were converted.
-   int supportedSum = supported.getConvSum();
-   int unsupportedSum = unsupported.getConvSum();
+    // Total number of (un)supported refs that were converted.
+    int supportedSum = supported.getConvSum();
+    int unsupportedSum = unsupported.getConvSum();

-   printStat(csv, printOut, "CONVERTED refs count", supportedSum);
-   printStat(csv, printOut, "UNCONVERTED refs count", unsupportedSum);
-   printStat(csv, printOut, "CONVERSION %", 100 - std::lround(double(unsupportedSum * 100) / double(supportedSum + unsupportedSum)));
-   printStat(csv, printOut, "REPLACED bytes", touchedBytes);
-   printStat(csv, printOut, "TOTAL bytes", totalBytes);
-   printStat(csv, printOut, "CHANGED lines of code", changedLines);
-   printStat(csv, printOut, "TOTAL lines of code", totalLines);
+    printStat(csv, printOut, "CONVERTED refs count", supportedSum);
+    printStat(csv, printOut, "UNCONVERTED refs count", unsupportedSum);
+    printStat(
+        csv, printOut, "CONVERSION %",
+        100 - std::lround(double(unsupportedSum * 100) / double(supportedSum + unsupportedSum)));
+    printStat(csv, printOut, "REPLACED bytes", touchedBytes);
+    printStat(csv, printOut, "TOTAL bytes", totalBytes);
+    printStat(csv, printOut, "CHANGED lines of code", changedLines);
+    printStat(csv, printOut, "TOTAL lines of code", totalLines);

-   if (totalBytes > 0) {
-       printStat(csv, printOut, "CODE CHANGED (in bytes) %", std::lround(double(touchedBytes * 100) / double(totalBytes)));
-   }
+    if (totalBytes > 0) {
+        printStat(csv, printOut, "CODE CHANGED (in bytes) %",
+                  std::lround(double(touchedBytes * 100) / double(totalBytes)));
+    }

-   if (totalLines > 0) {
-       printStat(csv, printOut, "CODE CHANGED (in lines) %", std::lround(double(changedLines * 100) / double(totalLines)));
-   }
+    if (totalLines > 0) {
+        printStat(csv, printOut, "CODE CHANGED (in lines) %",
+                  std::lround(double(changedLines * 100) / double(totalLines)));
+    }

-   typedef std::chrono::duration<double, std::milli> duration;
-   duration elapsed = completionTime - startTime;
-   std::stringstream stream;
-   stream << std::fixed << std::setprecision(2) << elapsed.count() / 1000;
-   printStat(csv, printOut, "TIME ELAPSED s", stream.str());
+    typedef std::chrono::duration<double, std::milli> duration;
+    duration elapsed = completionTime - startTime;
+    std::stringstream stream;
+    stream << std::fixed << std::setprecision(2) << elapsed.count() / 1000;
+    printStat(csv, printOut, "TIME ELAPSED s", stream.str());

-   supported.print(csv, printOut, "CONVERTED");
-   unsupported.print(csv, printOut, "UNCONVERTED");
+    supported.print(csv, printOut, "CONVERTED");
+    unsupported.print(csv, printOut, "UNCONVERTED");
 }

-void Statistics::printAggregate(std::ostream *csv, llvm::raw_ostream* printOut) {
+void Statistics::printAggregate(std::ostream* csv, llvm::raw_ostream* printOut) {
    Statistics globalStats = getAggregate();

-    conditionalPrint(csv, printOut, "\nTOTAL statistics:\n", "\n[HIPIFY] info: TOTAL statistics:\n");
+    conditionalPrint(csv, printOut, "\nTOTAL statistics:\n",
+                     "\n[HIPIFY] info: TOTAL statistics:\n");

    // A file is considered "converted" if we made any changes to it.
    int convertedFiles = 0;
@@ -49,20 +49,14 @@ enum ConvTypes {
    CONV_NUMERIC_LITERAL,
    CONV_LAST
 };
-constexpr int NUM_CONV_TYPES = (int) ConvTypes::CONV_LAST;
+constexpr int NUM_CONV_TYPES = (int)ConvTypes::CONV_LAST;

-enum ApiTypes {
-    API_DRIVER = 0,
-    API_RUNTIME,
-    API_BLAS,
-    API_RAND,
-    API_LAST
-};
-constexpr int NUM_API_TYPES = (int) ApiTypes::API_LAST;
+enum ApiTypes { API_DRIVER = 0, API_RUNTIME, API_BLAS, API_RAND, API_LAST };
+constexpr int NUM_API_TYPES = (int)ApiTypes::API_LAST;

 // The names of various fields in the statistics reports.
-extern const char *counterNames[NUM_CONV_TYPES];
-extern const char *apiNames[NUM_API_TYPES];
+extern const char* counterNames[NUM_CONV_TYPES];
+extern const char* apiNames[NUM_API_TYPES];


 struct hipCounter {
@@ -77,14 +71,14 @@ struct hipCounter {
 * Tracks a set of named counters, as well as counters for each of the type enums defined above.
 */
 class StatCounter {
-private:
+   private:
    // Each thing we track is either "supported" or "unsupported"...
    std::map<std::string, int> counters;

    int apiCounters[NUM_API_TYPES] = {};
    int convTypeCounters[NUM_CONV_TYPES] = {};

-public:
+   public:
    void incrementCounter(const hipCounter& counter, std::string name);

    /**
@@ -115,15 +109,15 @@ class Statistics {
    chr::steady_clock::time_point startTime;
    chr::steady_clock::time_point completionTime;

-public:
+   public:
    Statistics(std::string name);

-    void incrementCounter(const hipCounter &counter, std::string name);
+    void incrementCounter(const hipCounter& counter, std::string name);

    /**
     * Add the counters from `other` onto the counters of this object.
     */
-    void add(const Statistics &other);
+    void add(const Statistics& other);

    void lineTouched(int lineNumber);
    void bytesChanged(int bytes);
@@ -135,18 +129,18 @@ public:

    /////// Output functions ///////

-public:
-   /**
+   public:
+    /**
     * Pretty-print the statistics stored in this object.
     *
     * @param csv Pointer to an output stream for the CSV to write. If null, no CSV is written
-     * @param printOut Pointer to an output stream to print human-readable textual stats to. If null, no
-     *                 such stats are produced.
+     * @param printOut Pointer to an output stream to print human-readable textual stats to. If
+     * null, no such stats are produced.
     */
    void print(std::ostream* csv, llvm::raw_ostream* printOut, bool skipHeader = false);

    /// Print aggregated statistics for all registered counters.
-    static void printAggregate(std::ostream *csv, llvm::raw_ostream* printOut);
+    static void printAggregate(std::ostream* csv, llvm::raw_ostream* printOut);

    /////// Static nonsense ///////

@@ -162,15 +156,15 @@ public:
    static Statistics getAggregate();

    /**
-     * Convenient global entry point for updating the "active" Statistics. Since we operate single-threadedly
-     * processing one file at a time, this allows us to simply expose the stats for the current file globally,
-     * simplifying things.
+     * Convenient global entry point for updating the "active" Statistics. Since we operate
+     * single-threadedly processing one file at a time, this allows us to simply expose the stats
+     * for the current file globally, simplifying things.
     */
    static Statistics& current();

    /**
-     * Set the active Statistics object to the named one, creating it if necessary, and write the completion
-     * timestamp into the currently active one.
+     * Set the active Statistics object to the named one, creating it if necessary, and write the
+     * completion timestamp into the currently active one.
     */
    static void setActive(std::string name);
 };
@@ -8,7 +8,7 @@ llvm::StringRef unquoteStr(llvm::StringRef s) {
    return s;
 }

-void removePrefixIfPresent(std::string &s, std::string prefix) {
+void removePrefixIfPresent(std::string& s, std::string prefix) {
    if (s.find(prefix) != 0) {
        return;
    }
@@ -11,4 +11,4 @@ llvm::StringRef unquoteStr(llvm::StringRef s);
 /**
 * If `s` starts with `prefix`, remove it. Otherwise, does nothing.
 */
-void removePrefixIfPresent(std::string &s, std::string prefix);
+void removePrefixIfPresent(std::string& s, std::string prefix);
@@ -46,111 +46,116 @@ namespace ct = clang::tooling;
 namespace {

 void copyFile(const std::string& src, const std::string& dst) {
-  std::ifstream source(src, std::ios::binary);
-  std::ofstream dest(dst, std::ios::binary);
-  dest << source.rdbuf();
+    std::ifstream source(src, std::ios::binary);
+    std::ofstream dest(dst, std::ios::binary);
+    dest << source.rdbuf();
 }

-} // anonymous namespace
+}  // anonymous namespace

-int main(int argc, const char **argv) {
-  llcompat::PrintStackTraceOnErrorSignal();
+int main(int argc, const char** argv) {
+    llcompat::PrintStackTraceOnErrorSignal();

-  ct::CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::OneOrMore);
-  std::vector<std::string> fileSources = OptionsParser.getSourcePathList();
-  std::string dst = OutputFilename;
-  if (!dst.empty() && fileSources.size() > 1) {
-    llvm::errs() << "[HIPIFY] conflict: -o and multiple source files are specified.\n";
-    return 1;
-  }
-
-  if (NoOutput) {
-    if (Inplace) {
-      llvm::errs() << "[HIPIFY] conflict: both -no-output and -inplace options are specified.\n";
-      return 1;
-    }
-    if (!dst.empty()) {
-      llvm::errs() << "[HIPIFY] conflict: both -no-output and -o options are specified.\n";
-      return 1;
-    }
-  }
-
-  if (Examine) {
-    NoOutput = PrintStats = true;
-  }
-
-  int Result = 0;
-
-  // Arguments for the Statistics print routines.
-  std::unique_ptr<std::ostream> csv = nullptr;
-  llvm::raw_ostream* statPrint = nullptr;
-  if (!OutputStatsFilename.empty()) {
-    csv = std::unique_ptr<std::ostream>(new std::ofstream(OutputStatsFilename, std::ios_base::trunc));
-  }
-  if (PrintStats) {
-    statPrint = &llvm::errs();
-  }
-
-  for (const auto & src : fileSources) {
-    if (dst.empty()) {
-      if (Inplace) {
-        dst = src;
-      } else {
-        dst = src + ".hip";
-      }
-    } else if (Inplace) {
-      llvm::errs() << "[HIPIFY] conflict: both -o and -inplace options are specified.\n";
-      return 1;
+    ct::CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::OneOrMore);
+    std::vector<std::string> fileSources = OptionsParser.getSourcePathList();
+    std::string dst = OutputFilename;
+    if (!dst.empty() && fileSources.size() > 1) {
+        llvm::errs() << "[HIPIFY] conflict: -o and multiple source files are specified.\n";
+        return 1;
    }

-    std::string tmpFile = src + ".hipify-tmp";
+    if (NoOutput) {
+        if (Inplace) {
+            llvm::errs()
+                << "[HIPIFY] conflict: both -no-output and -inplace options are specified.\n";
+            return 1;
+        }
+        if (!dst.empty()) {
+            llvm::errs() << "[HIPIFY] conflict: both -no-output and -o options are specified.\n";
+            return 1;
+        }
+    }

-    // Create a copy of the file to work on. When we're done, we'll move this onto the
-    // output (which may mean overwriting the input, if we're in-place).
-    // Should we fail for some reason, we'll just leak this file and not corrupt the input.
-    copyFile(src, tmpFile);
+    if (Examine) {
+        NoOutput = PrintStats = true;
+    }

-    // Initialise the statistics counters for this file.
-    Statistics::setActive(src);
+    int Result = 0;

-    // RefactoringTool operates on the file in-place. Giving it the output path is no good,
-    // because that'll break relative includes, and we don't want to overwrite the input file.
-    // So what we do is operate on a copy, which we then move to the output.
-    ct::RefactoringTool Tool(OptionsParser.getCompilations(), tmpFile);
-    ct::Replacements& replacementsToUse = llcompat::getReplacements(Tool, tmpFile);
+    // Arguments for the Statistics print routines.
+    std::unique_ptr<std::ostream> csv = nullptr;
+    llvm::raw_ostream* statPrint = nullptr;
+    if (!OutputStatsFilename.empty()) {
+        csv = std::unique_ptr<std::ostream>(
+            new std::ofstream(OutputStatsFilename, std::ios_base::trunc));
+    }
+    if (PrintStats) {
+        statPrint = &llvm::errs();
+    }

-    ReplacementsFrontendActionFactory<HipifyAction> actionFactory(&replacementsToUse);
+    for (const auto& src : fileSources) {
+        if (dst.empty()) {
+            if (Inplace) {
+                dst = src;
+            } else {
+                dst = src + ".hip";
+            }
+        } else if (Inplace) {
+            llvm::errs() << "[HIPIFY] conflict: both -o and -inplace options are specified.\n";
+            return 1;
+        }

-    Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN));
+        std::string tmpFile = src + ".hipify-tmp";

-    // Ensure at least c++11 is used.
-    Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-std=c++11", ct::ArgumentInsertPosition::BEGIN));
+        // Create a copy of the file to work on. When we're done, we'll move this onto the
+        // output (which may mean overwriting the input, if we're in-place).
+        // Should we fail for some reason, we'll just leak this file and not corrupt the input.
+        copyFile(src, tmpFile);
+
+        // Initialise the statistics counters for this file.
+        Statistics::setActive(src);
+
+        // RefactoringTool operates on the file in-place. Giving it the output path is no good,
+        // because that'll break relative includes, and we don't want to overwrite the input file.
+        // So what we do is operate on a copy, which we then move to the output.
+        ct::RefactoringTool Tool(OptionsParser.getCompilations(), tmpFile);
+        ct::Replacements& replacementsToUse = llcompat::getReplacements(Tool, tmpFile);
+
+        ReplacementsFrontendActionFactory<HipifyAction> actionFactory(&replacementsToUse);
+
+        Tool.appendArgumentsAdjuster(
+            ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN));
+
+        // Ensure at least c++11 is used.
+        Tool.appendArgumentsAdjuster(
+            ct::getInsertArgumentAdjuster("-std=c++11", ct::ArgumentInsertPosition::BEGIN));
 #if defined(HIPIFY_CLANG_RES)
-    Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES));
+        Tool.appendArgumentsAdjuster(
+            ct::getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES));
 #endif
-    Tool.appendArgumentsAdjuster(ct::getClangSyntaxOnlyAdjuster());
+        Tool.appendArgumentsAdjuster(ct::getClangSyntaxOnlyAdjuster());

-    // Hipify _all_ the things!
-    if (Tool.runAndSave(&actionFactory)) {
-      DEBUG(llvm::dbgs() << "Skipped some replacements.\n");
+        // Hipify _all_ the things!
+        if (Tool.runAndSave(&actionFactory)) {
+            DEBUG(llvm::dbgs() << "Skipped some replacements.\n");
+        }
+
+        // Either move the tmpfile to the output, or remove it.
+        if (!NoOutput) {
+            rename(tmpFile.c_str(), dst.c_str());
+        } else {
+            remove(tmpFile.c_str());
+        }
+
+        Statistics::current().markCompletion();
+        Statistics::current().print(csv.get(), statPrint);
+
+        dst.clear();
    }

-    // Either move the tmpfile to the output, or remove it.
-    if (!NoOutput) {
-      rename(tmpFile.c_str(), dst.c_str());
-    } else {
-      remove(tmpFile.c_str());
+    if (fileSources.size() > 1) {
+        Statistics::printAggregate(csv.get(), statPrint);
    }

-    Statistics::current().markCompletion();
-    Statistics::current().print(csv.get(), statPrint);
-
-    dst.clear();
-  }
-
-  if (fileSources.size() > 1) {
-    Statistics::printAggregate(csv.get(), statPrint);
-  }
-
-  return Result;
+    return Result;
 }
@@ -28,9 +28,9 @@ THE SOFTWARE.
 // on NVCC path:


-#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
 #include <hip/hcc_detail/channel_descriptor.h>
-#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
 #include <hip/nvcc_detail/channel_descriptor.h>
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
@@ -23,11 +23,11 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_DEVICE_FUNCTIONS_H
 #define HIP_INCLUDE_HIP_DEVICE_FUNCTIONS_H

-#include<hip/hip_common.h>
+#include <hip/hip_common.h>

-#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
 #include <hip/hcc_detail/device_functions.h>
-#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
 #include <device_functions.h>
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
@@ -25,9 +25,9 @@ THE SOFTWARE.

 #include <hip/hip_common.h>

-#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
 #include <hip/hcc_detail/driver_types.h>
-#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
 #include "driver_types.h"
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
@@ -23,359 +23,313 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_CHANNEL_DESCRIPTOR_H
 #define HIP_INCLUDE_HIP_HCC_DETAIL_CHANNEL_DESCRIPTOR_H

-#include<hip/hcc_detail/driver_types.h>
-#include<hip/hcc_detail/hip_vector_types.h>
+#include <hip/hcc_detail/driver_types.h>
+#include <hip/hcc_detail/hip_vector_types.h>

 #ifdef __cplusplus

 hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);

 static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
-  int e = (int)sizeof(unsigned short) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }

 static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
-  int e = (int)sizeof(unsigned short) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }

-static inline hipChannelFormatDesc hipCreateChannelDescHalf2()
-{
-  int e = (int)sizeof(unsigned short) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }

-template<typename T>
+template <typename T>
 static inline hipChannelFormatDesc hipCreateChannelDesc() {
-  return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
+    return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<char>()
-{
-  int e = (int)sizeof(char) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
+    int e = (int)sizeof(char) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<signed char>()
-{
-  int e = (int)sizeof(signed char) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
+    int e = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>()
-{
-  int e = (int)sizeof(unsigned char) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>()
-{
-  int e = (int)sizeof(unsigned char) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<char1>()
-{
-  int e = (int)sizeof(signed char) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
+    int e = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>()
-{
-  int e = (int)sizeof(unsigned char) * 8;
-  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<char2>()
-{
-  int e = (int)sizeof(signed char) * 8;
-  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
+    int e = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>()
-{
-  int e = (int)sizeof(unsigned char) * 8;
-  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<char3>()
-{
-  int e = (int)sizeof(signed char) * 8;
-  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
+    int e = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>()
-{
-  int e = (int)sizeof(unsigned char) * 8;
-  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<char4>()
-{
-  int e = (int)sizeof(signed char) * 8;
-  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
+    int e = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>()
-{
-  int e = (int)sizeof(unsigned short) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<signed short>()
-{
-  int e = (int)sizeof(signed short) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
+    int e = (int)sizeof(signed short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>()
-{
-  int e = (int)sizeof(unsigned short) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<short1>()
-{
-  int e = (int)sizeof(signed short) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
+    int e = (int)sizeof(signed short) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>()
-{
-  int e = (int)sizeof(unsigned short) * 8;
-  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<short2>()
-{
-  int e = (int)sizeof(signed short) * 8;
-  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
+    int e = (int)sizeof(signed short) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>()
-{
-  int e = (int)sizeof(unsigned short) * 8;
-  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<short3>()
-{
-  int e = (int)sizeof(signed short) * 8;
-  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
+    int e = (int)sizeof(signed short) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>()
-{
-  int e = (int)sizeof(unsigned short) * 8;
-  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<short4>()
-{
-  int e = (int)sizeof(signed short) * 8;
-  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
+    int e = (int)sizeof(signed short) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>()
-{
-  int e = (int)sizeof(unsigned int) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
+    int e = (int)sizeof(unsigned int) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<signed int>()
-{
-  int e = (int)sizeof(signed int) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
+    int e = (int)sizeof(signed int) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<uint1>()
-{
-  int e = (int)sizeof(unsigned int) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
+    int e = (int)sizeof(unsigned int) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<int1>()
-{
-  int e = (int)sizeof(signed int) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
+    int e = (int)sizeof(signed int) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<uint2>()
-{
-  int e = (int)sizeof(unsigned int) * 8;
-  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
+    int e = (int)sizeof(unsigned int) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<int2>()
-{
-  int e = (int)sizeof(signed int) * 8;
-  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
+    int e = (int)sizeof(signed int) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<uint3>()
-{
-  int e = (int)sizeof(unsigned int) * 8;
-  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
+    int e = (int)sizeof(unsigned int) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<int3>()
-{
-  int e = (int)sizeof(signed int) * 8;
-  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
+    int e = (int)sizeof(signed int) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<uint4>()
-{
-  int e = (int)sizeof(unsigned int) * 8;
-  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
+    int e = (int)sizeof(unsigned int) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<int4>()
-{
-  int e = (int)sizeof(signed int) * 8;
-  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
+    int e = (int)sizeof(signed int) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<float>()
-{
-  int e = (int)sizeof(float) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
+    int e = (int)sizeof(float) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<float1>()
-{
-  int e = (int)sizeof(float) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
+    int e = (int)sizeof(float) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<float2>()
-{
-  int e = (int)sizeof(float) * 8;
-  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
+    int e = (int)sizeof(float) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<float3>()
-{
-  int e = (int)sizeof(float) * 8;
-  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
+    int e = (int)sizeof(float) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<float4>()
-{
-  int e = (int)sizeof(float) * 8;
-  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
+    int e = (int)sizeof(float) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>()
-{
-  int e = (int)sizeof(unsigned long) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
+    int e = (int)sizeof(unsigned long) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<signed long>()
-{
-  int e = (int)sizeof(signed long) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
+    int e = (int)sizeof(signed long) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>()
-{
-  int e = (int)sizeof(unsigned long) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
+    int e = (int)sizeof(unsigned long) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<long1>()
-{
-  int e = (int)sizeof(signed long) * 8;
-  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
+    int e = (int)sizeof(signed long) * 8;
+    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>()
-{
-  int e = (int)sizeof(unsigned long) * 8;
-  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
+    int e = (int)sizeof(unsigned long) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<long2>()
-{
-  int e = (int)sizeof(signed long) * 8;
-  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
+    int e = (int)sizeof(signed long) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>()
-{
-  int e = (int)sizeof(unsigned long) * 8;
-  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
+    int e = (int)sizeof(unsigned long) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<long3>()
-{
-  int e = (int)sizeof(signed long) * 8;
-  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
+    int e = (int)sizeof(signed long) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>()
-{
-  int e = (int)sizeof(unsigned long) * 8;
-  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
+    int e = (int)sizeof(unsigned long) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
 }

-template<>
-inline hipChannelFormatDesc hipCreateChannelDesc<long4>()
-{
-  int e = (int)sizeof(signed long) * 8;
-  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
+    int e = (int)sizeof(signed long) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
 }

 #else

-struct hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, enum hipChannelFormatKind f);
+struct hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
+                                                 enum hipChannelFormatKind f);

 #endif

@@ -32,129 +32,103 @@ THE SOFTWARE.
 #include <utility>
 #include <vector>

-namespace hip_impl
-{
-    hsa_isa_t triple_to_hsa_isa(const std::string& triple);
+namespace hip_impl {
+hsa_isa_t triple_to_hsa_isa(const std::string& triple);

-    struct Bundled_code {
-        union Header {
-            struct {
-                std::uint64_t offset;
-                std::uint64_t bundle_sz;
-                std::uint64_t triple_sz;
-            };
-            char cbuf[sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
-        } header;
-        std::string triple;
-        std::vector<char> blob;
-    };
+struct Bundled_code {
+    union Header {
+        struct {
+            std::uint64_t offset;
+            std::uint64_t bundle_sz;
+            std::uint64_t triple_sz;
+        };
+        char cbuf[sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
+    } header;
+    std::string triple;
+    std::vector<char> blob;
+};

-    class Bundled_code_header {
-        // DATA - STATICS
-        static constexpr const char magic_string_[] =
-            "__CLANG_OFFLOAD_BUNDLE__";
-        static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1;
+class Bundled_code_header {
+    // DATA - STATICS
+    static constexpr const char magic_string_[] = "__CLANG_OFFLOAD_BUNDLE__";
+    static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1;

-        // DATA
-        union Header_ {
-            struct {
-                char bundler_magic_string_[magic_string_sz_];
-                std::uint64_t bundle_cnt_;
-            };
-            char cbuf_[sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)];
-        } header_;
-        std::vector<Bundled_code> bundles_;
+    // DATA
+    union Header_ {
+        struct {
+            char bundler_magic_string_[magic_string_sz_];
+            std::uint64_t bundle_cnt_;
+        };
+        char cbuf_[sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)];
+    } header_;
+    std::vector<Bundled_code> bundles_;

-        // FRIENDS - MANIPULATORS
-        template<typename RandomAccessIterator>
-        friend
-        inline
-        bool read(
-            RandomAccessIterator f,
-            RandomAccessIterator l,
-            Bundled_code_header& x)
-        {
-            if (f == l) return false;
+    // FRIENDS - MANIPULATORS
+    template <typename RandomAccessIterator>
+    friend inline bool read(RandomAccessIterator f, RandomAccessIterator l,
+                            Bundled_code_header& x) {
+        if (f == l) return false;

-            std::copy_n(f, sizeof(x.header_.cbuf_), x.header_.cbuf_);
+        std::copy_n(f, sizeof(x.header_.cbuf_), x.header_.cbuf_);

-            if (valid(x)) {
-                x.bundles_.resize(x.header_.bundle_cnt_);
+        if (valid(x)) {
+            x.bundles_.resize(x.header_.bundle_cnt_);

-                auto it = f + sizeof(x.header_.cbuf_);
-                for (auto&& y : x.bundles_) {
-                    std::copy_n(it, sizeof(y.header.cbuf), y.header.cbuf);
-                    it += sizeof(y.header.cbuf);
+            auto it = f + sizeof(x.header_.cbuf_);
+            for (auto&& y : x.bundles_) {
+                std::copy_n(it, sizeof(y.header.cbuf), y.header.cbuf);
+                it += sizeof(y.header.cbuf);

-                    y.triple.assign(it, it + y.header.triple_sz);
+                y.triple.assign(it, it + y.header.triple_sz);

-                    std::copy_n(
-                        f + y.header.offset,
-                        y.header.bundle_sz,
-                        std::back_inserter(y.blob));
+                std::copy_n(f + y.header.offset, y.header.bundle_sz, std::back_inserter(y.blob));

-                    it += y.header.triple_sz;
-                }
-
-                return true;
+                it += y.header.triple_sz;
            }

-            return false;
-        }
-        friend
-        inline
-        bool read(const std::vector<char>& blob, Bundled_code_header& x)
-        {
-            return read(blob.cbegin(), blob.cend(), x);
-        }
-        friend
-        inline
-        bool read(std::istream& is, Bundled_code_header& x)
-        {
-            return read(std::vector<char>{
-                std::istreambuf_iterator<char>{is},
-                std::istreambuf_iterator<char>{}},
-                x);
+            return true;
        }

-        // FRIENDS - ACCESSORS
-        friend
-        inline
-        bool valid(const Bundled_code_header& x)
-        {
-            return std::equal(
-                magic_string_,
-                magic_string_ + magic_string_sz_,
-                x.header_.bundler_magic_string_);
-        }
-        friend
-        inline
-        const std::vector<Bundled_code>& bundles(const Bundled_code_header& x)
-        {
-            return x.bundles_;
-        }
-    public:
-        // CREATORS
-        Bundled_code_header() = default;
-        template<typename RandomAccessIterator>
-        Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l);
-        explicit
-        Bundled_code_header(const std::vector<char>& blob);
-        explicit
-        Bundled_code_header(const void* maybe_blob);
-        Bundled_code_header(const Bundled_code_header&) = default;
-        Bundled_code_header(Bundled_code_header&&) = default;
-        ~Bundled_code_header() = default;
-
-        // MANIPULATORS
-        Bundled_code_header& operator=(const Bundled_code_header&) = default;
-        Bundled_code_header& operator=(Bundled_code_header&&) = default;
-    };
-
-    // CREATORS
-    template<typename RandomAccessIterator>
-    Bundled_code_header::Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l) : Bundled_code_header{}
-    {
-        read(f, l, *this);
+        return false;
    }
-} // Namespace hip_impl.
+    friend inline bool read(const std::vector<char>& blob, Bundled_code_header& x) {
+        return read(blob.cbegin(), blob.cend(), x);
+    }
+    friend inline bool read(std::istream& is, Bundled_code_header& x) {
+        return read(
+            std::vector<char>{std::istreambuf_iterator<char>{is}, std::istreambuf_iterator<char>{}},
+            x);
+    }
+
+    // FRIENDS - ACCESSORS
+    friend inline bool valid(const Bundled_code_header& x) {
+        return std::equal(magic_string_, magic_string_ + magic_string_sz_,
+                          x.header_.bundler_magic_string_);
+    }
+    friend inline const std::vector<Bundled_code>& bundles(const Bundled_code_header& x) {
+        return x.bundles_;
+    }
+
+   public:
+    // CREATORS
+    Bundled_code_header() = default;
+    template <typename RandomAccessIterator>
+    Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l);
+    explicit Bundled_code_header(const std::vector<char>& blob);
+    explicit Bundled_code_header(const void* maybe_blob);
+    Bundled_code_header(const Bundled_code_header&) = default;
+    Bundled_code_header(Bundled_code_header&&) = default;
+    ~Bundled_code_header() = default;
+
+    // MANIPULATORS
+    Bundled_code_header& operator=(const Bundled_code_header&) = default;
+    Bundled_code_header& operator=(Bundled_code_header&&) = default;
+};
+
+// CREATORS
+template <typename RandomAccessIterator>
+Bundled_code_header::Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l)
+    : Bundled_code_header{} {
+    read(f, l, *this);
+}
+}  // Namespace hip_impl.
@@ -22,9 +22,9 @@ THE SOFTWARE.

 #pragma once

-namespace hip_impl // Documentation only.
+namespace hip_impl  // Documentation only.
 {
-    #define requires(...)
+#define requires(...)

-    #define FunctionalProcedure typename
-}
+#define FunctionalProcedure typename
+}  // namespace hip_impl
@@ -27,84 +27,81 @@ THE SOFTWARE.
 #include <hip/hip_vector_types.h>


-
-
-
 // Single Precision Fast Math
-__device__  float __cosf(float x);
-__device__  float __exp10f(float x);
-__device__  float __expf(float x);
-__device__ static  float __fadd_rd(float x, float y);
-__device__ static  float __fadd_rn(float x, float y);
-__device__ static  float __fadd_ru(float x, float y);
-__device__ static  float __fadd_rz(float x, float y);
-__device__ static  float __fdiv_rd(float x, float y);
-__device__ static  float __fdiv_rn(float x, float y);
-__device__ static  float __fdiv_ru(float x, float y);
-__device__ static  float __fdiv_rz(float x, float y);
-__device__ static  float __fdividef(float x, float y);
-__device__  float __fmaf_rd(float x, float y, float z);
-__device__  float __fmaf_rn(float x, float y, float z);
-__device__  float __fmaf_ru(float x, float y, float z);
-__device__  float __fmaf_rz(float x, float y, float z);
-__device__ static  float __fmul_rd(float x, float y);
-__device__ static  float __fmul_rn(float x, float y);
-__device__ static  float __fmul_ru(float x, float y);
-__device__ static  float __fmul_rz(float x, float y);
-__device__  float __frcp_rd(float x);
-__device__  float __frcp_rn(float x);
-__device__  float __frcp_ru(float x);
-__device__  float __frcp_rz(float x);
-__device__  float __frsqrt_rn(float x);
-__device__  float __fsqrt_rd(float x);
-__device__  float __fsqrt_rn(float x);
-__device__  float __fsqrt_ru(float x);
-__device__  float __fsqrt_rz(float x);
-__device__ static  float __fsub_rd(float x, float y);
-__device__ static  float __fsub_rn(float x, float y);
-__device__ static  float __fsub_ru(float x, float y);
-__device__  float __log10f(float x);
-__device__  float __log2f(float x);
-__device__  float __logf(float x);
-__device__  float __powf(float base, float exponent);
-__device__ static  float __saturatef(float x);
-__device__  void __sincosf(float x, float *s, float *c);
-__device__  float __sinf(float x);
-__device__  float __tanf(float x);
+__device__ float __cosf(float x);
+__device__ float __exp10f(float x);
+__device__ float __expf(float x);
+__device__ static float __fadd_rd(float x, float y);
+__device__ static float __fadd_rn(float x, float y);
+__device__ static float __fadd_ru(float x, float y);
+__device__ static float __fadd_rz(float x, float y);
+__device__ static float __fdiv_rd(float x, float y);
+__device__ static float __fdiv_rn(float x, float y);
+__device__ static float __fdiv_ru(float x, float y);
+__device__ static float __fdiv_rz(float x, float y);
+__device__ static float __fdividef(float x, float y);
+__device__ float __fmaf_rd(float x, float y, float z);
+__device__ float __fmaf_rn(float x, float y, float z);
+__device__ float __fmaf_ru(float x, float y, float z);
+__device__ float __fmaf_rz(float x, float y, float z);
+__device__ static float __fmul_rd(float x, float y);
+__device__ static float __fmul_rn(float x, float y);
+__device__ static float __fmul_ru(float x, float y);
+__device__ static float __fmul_rz(float x, float y);
+__device__ float __frcp_rd(float x);
+__device__ float __frcp_rn(float x);
+__device__ float __frcp_ru(float x);
+__device__ float __frcp_rz(float x);
+__device__ float __frsqrt_rn(float x);
+__device__ float __fsqrt_rd(float x);
+__device__ float __fsqrt_rn(float x);
+__device__ float __fsqrt_ru(float x);
+__device__ float __fsqrt_rz(float x);
+__device__ static float __fsub_rd(float x, float y);
+__device__ static float __fsub_rn(float x, float y);
+__device__ static float __fsub_ru(float x, float y);
+__device__ float __log10f(float x);
+__device__ float __log2f(float x);
+__device__ float __logf(float x);
+__device__ float __powf(float base, float exponent);
+__device__ static float __saturatef(float x);
+__device__ void __sincosf(float x, float* s, float* c);
+__device__ float __sinf(float x);
+__device__ float __tanf(float x);


 /*
 Double Precision Intrinsics
 */

-__device__ static  double __dadd_rd(double x, double y);
-__device__ static  double __dadd_rn(double x, double y);
-__device__ static  double __dadd_ru(double x, double y);
-__device__ static  double __dadd_rz(double x, double y);
-__device__ static  double __ddiv_rd(double x, double y);
-__device__ static  double __ddiv_rn(double x, double y);
-__device__ static  double __ddiv_ru(double x, double y);
-__device__ static  double __ddiv_rz(double x, double y);
-__device__ static  double __dmul_rd(double x, double y);
-__device__ static  double __dmul_rn(double x, double y);
-__device__ static  double __dmul_ru(double x, double y);
-__device__ static  double __dmul_rz(double x, double y);
-__device__  double __drcp_rd(double x);
-__device__  double __drcp_rn(double x);
-__device__  double __drcp_ru(double x);
-__device__  double __drcp_rz(double x);
-__device__  double __dsqrt_rd(double x);
-__device__  double __dsqrt_rn(double x);
-__device__  double __dsqrt_ru(double x);
-__device__  double __dsqrt_rz(double x);
-__device__ static  double __dsub_rd(double x, double y);
-__device__ static  double __dsub_rn(double x, double y);
-__device__ static  double __dsub_ru(double x, double y);
-__device__ static  double __dsub_rz(double x, double y);
-__device__  double __fma_rd(double x, double y, double z);
-__device__  double __fma_rn(double x, double y, double z);
-__device__  double __fma_ru(double x, double y, double z);
-__device__  double __fma_rz(double x, double y, double z);
+__device__ static double __dadd_rd(double x, double y);
+__device__ static double __dadd_rn(double x, double y);
+__device__ static double __dadd_ru(double x, double y);
+__device__ static double __dadd_rz(double x, double y);
+__device__ static double __ddiv_rd(double x, double y);
+__device__ static double __ddiv_rn(double x, double y);
+__device__ static double __ddiv_ru(double x, double y);
+__device__ static double __ddiv_rz(double x, double y);
+__device__ static double __dmul_rd(double x, double y);
+__device__ static double __dmul_rn(double x, double y);
+__device__ static double __dmul_ru(double x, double y);
+__device__ static double __dmul_rz(double x, double y);
+__device__ double __drcp_rd(double x);
+__device__ double __drcp_rn(double x);
+__device__ double __drcp_ru(double x);
+__device__ double __drcp_rz(double x);
+__device__ double __dsqrt_rd(double x);
+__device__ double __dsqrt_rn(double x);
+__device__ double __dsqrt_ru(double x);
+__device__ double __dsqrt_rz(double x);
+__device__ static double __dsub_rd(double x, double y);
+__device__ static double __dsub_rn(double x, double y);
+__device__ static double __dsub_ru(double x, double y);
+__device__ static double __dsub_rz(double x, double y);
+__device__ double __fma_rd(double x, double y, double z);
+__device__ double __fma_rn(double x, double y, double z);
+__device__ double __fma_ru(double x, double y, double z);
+__device__ double __fma_rz(double x, double y, double z);

 // Single Precision Fast Math
 extern __attribute__((const)) float __hip_fast_cosf(float) __asm("llvm.cos.f32");
@@ -120,302 +117,170 @@ __device__ float __hip_fast_log10f(float);
 extern __attribute__((const)) float __hip_fast_log2f(float) __asm("llvm.log2.f32");
 __device__ float __hip_fast_logf(float);
 __device__ float __hip_fast_powf(float, float);
-__device__ void __hip_fast_sincosf(float,float*,float*);
+__device__ void __hip_fast_sincosf(float, float*, float*);
 extern __attribute__((const)) float __hip_fast_sinf(float) __asm("llvm.sin.f32");
 __device__ float __hip_fast_tanf(float);
-extern __attribute__((const)) float __hip_fast_fmaf(float,float,float) __asm("llvm.fma.f32");
+extern __attribute__((const)) float __hip_fast_fmaf(float, float, float) __asm("llvm.fma.f32");
 extern __attribute__((const)) float __hip_fast_frcp(float) __asm("llvm.amdgcn.rcp.f32");

 extern __attribute__((const)) double __hip_fast_dsqrt(double) __asm("llvm.sqrt.f64");
-extern __attribute__((const)) double __hip_fast_fma(double,double,double) __asm("llvm.fma.f64");
+extern __attribute__((const)) double __hip_fast_fma(double, double, double) __asm("llvm.fma.f64");
 extern __attribute__((const)) double __hip_fast_drcp(double) __asm("llvm.amdgcn.rcp.f64");


 // Single Precision Fast Math
-__device__ inline float __cosf(float x) {
-  return __hip_fast_cosf(x);
-}
+__device__ inline float __cosf(float x) { return __hip_fast_cosf(x); }

-__device__ inline float __exp10f(float x) {
-  return __hip_fast_exp10f(x);
-}
+__device__ inline float __exp10f(float x) { return __hip_fast_exp10f(x); }

-__device__ inline float __expf(float x) {
-  return __hip_fast_expf(x);
-}
+__device__ inline float __expf(float x) { return __hip_fast_expf(x); }

-__device__ static inline float __fadd_rd(float x, float y) {
-  return x + y;
-}
+__device__ static inline float __fadd_rd(float x, float y) { return x + y; }

-__device__ static inline float __fadd_rn(float x, float y) {
-  return x + y;
-}
+__device__ static inline float __fadd_rn(float x, float y) { return x + y; }

-__device__ static inline float __fadd_ru(float x, float y) {
-  return x + y;
-}
+__device__ static inline float __fadd_ru(float x, float y) { return x + y; }

-__device__ static inline float __fadd_rz(float x, float y) {
-  return x + y;
-}
+__device__ static inline float __fadd_rz(float x, float y) { return x + y; }

-__device__ static inline float __fdiv_rd(float x, float y) {
-  return x / y;
-}
+__device__ static inline float __fdiv_rd(float x, float y) { return x / y; }

-__device__ static inline float __fdiv_rn(float x, float y) {
-  return x / y;
-}
+__device__ static inline float __fdiv_rn(float x, float y) { return x / y; }

-__device__ static inline float __fdiv_ru(float x, float y) {
-  return x / y;
-}
+__device__ static inline float __fdiv_ru(float x, float y) { return x / y; }

-__device__ static inline float __fdiv_rz(float x, float y) {
-  return x / y;
-}
+__device__ static inline float __fdiv_rz(float x, float y) { return x / y; }

-__device__ static inline float __fdividef(float x, float y) {
-  return x / y;
-}
+__device__ static inline float __fdividef(float x, float y) { return x / y; }

-__device__ inline float __fmaf_rd(float x, float y, float z) {
-  return __hip_fast_fmaf(x, y, z);
-}
+__device__ inline float __fmaf_rd(float x, float y, float z) { return __hip_fast_fmaf(x, y, z); }

-__device__ inline float __fmaf_rn(float x, float y, float z) {
-  return __hip_fast_fmaf(x, y, z);
-}
+__device__ inline float __fmaf_rn(float x, float y, float z) { return __hip_fast_fmaf(x, y, z); }

-__device__ inline float __fmaf_ru(float x, float y, float z) {
-  return __hip_fast_fmaf(x, y, z);
-}
+__device__ inline float __fmaf_ru(float x, float y, float z) { return __hip_fast_fmaf(x, y, z); }

-__device__ inline float __fmaf_rz(float x, float y, float z) {
-  return __hip_fast_fmaf(x, y, z);
-}
+__device__ inline float __fmaf_rz(float x, float y, float z) { return __hip_fast_fmaf(x, y, z); }

-__device__ static inline float __fmul_rd(float x, float y) {
-  return x * y;
-}
+__device__ static inline float __fmul_rd(float x, float y) { return x * y; }

-__device__ static inline float __fmul_rn(float x, float y) {
-  return x * y;
-}
+__device__ static inline float __fmul_rn(float x, float y) { return x * y; }

-__device__ static inline float __fmul_ru(float x, float y) {
-  return x * y;
-}
+__device__ static inline float __fmul_ru(float x, float y) { return x * y; }

-__device__ static inline float __fmul_rz(float x, float y) {
-  return x * y;
-}
+__device__ static inline float __fmul_rz(float x, float y) { return x * y; }

-__device__ inline float __frcp_rd(float x) {
-  return __hip_fast_frcp(x);
-}
+__device__ inline float __frcp_rd(float x) { return __hip_fast_frcp(x); }

-__device__ inline float __frcp_rn(float x) {
-  return __hip_fast_frcp(x);
-}
+__device__ inline float __frcp_rn(float x) { return __hip_fast_frcp(x); }

-__device__ inline float __frcp_ru(float x) {
-  return __hip_fast_frcp(x);
-}
+__device__ inline float __frcp_ru(float x) { return __hip_fast_frcp(x); }

-__device__ inline float __frcp_rz(float x) {
-  return __hip_fast_frcp(x);
-}
+__device__ inline float __frcp_rz(float x) { return __hip_fast_frcp(x); }

-__device__ inline float __frsqrt_rn(float x) {
-  return __hip_fast_frsqrt_rn(x);
-}
+__device__ inline float __frsqrt_rn(float x) { return __hip_fast_frsqrt_rn(x); }

-__device__ inline float __fsqrt_rd(float x) {
-  return __hip_fast_fsqrt_rd(x);
-}
+__device__ inline float __fsqrt_rd(float x) { return __hip_fast_fsqrt_rd(x); }

-__device__ inline float __fsqrt_rn(float x) {
-  return __hip_fast_fsqrt_rn(x);
-}
+__device__ inline float __fsqrt_rn(float x) { return __hip_fast_fsqrt_rn(x); }

-__device__ inline float __fsqrt_ru(float x) {
-  return __hip_fast_fsqrt_ru(x);
-}
+__device__ inline float __fsqrt_ru(float x) { return __hip_fast_fsqrt_ru(x); }

-__device__ inline float __fsqrt_rz(float x) {
-  return __hip_fast_fsqrt_rz(x);
-}
+__device__ inline float __fsqrt_rz(float x) { return __hip_fast_fsqrt_rz(x); }

-__device__ static inline float __fsub_rd(float x, float y) {
-  return x - y;
-}
+__device__ static inline float __fsub_rd(float x, float y) { return x - y; }

-__device__ static inline float __fsub_rn(float x, float y) {
-  return x - y;
-}
+__device__ static inline float __fsub_rn(float x, float y) { return x - y; }

-__device__ static inline float __fsub_ru(float x, float y) {
-  return x - y;
-}
+__device__ static inline float __fsub_ru(float x, float y) { return x - y; }

-__device__ static inline float __fsub_rz(float x, float y) {
-  return x - y;
-}
+__device__ static inline float __fsub_rz(float x, float y) { return x - y; }


-__device__ inline float __log10f(float x) {
-  return __hip_fast_log10f(x);
-}
+__device__ inline float __log10f(float x) { return __hip_fast_log10f(x); }

-__device__ inline float __log2f(float x) {
-  return __hip_fast_log2f(x);
-}
+__device__ inline float __log2f(float x) { return __hip_fast_log2f(x); }

-__device__ inline float __logf(float x) {
-  return __hip_fast_logf(x);
-}
+__device__ inline float __logf(float x) { return __hip_fast_logf(x); }

 __device__ inline float __powf(float base, float exponent) {
-  return __hip_fast_powf(base, exponent);
+    return __hip_fast_powf(base, exponent);
 }

 __device__ static inline float __saturatef(float x) {
-  x = x > 1.0f ? 1.0f : x;
-  x = x < 0.0f ? 0.0f : x;
-  return x;
+    x = x > 1.0f ? 1.0f : x;
+    x = x < 0.0f ? 0.0f : x;
+    return x;
 }

-__device__ inline void __sincosf(float x, float *s, float *c) {
-  return __hip_fast_sincosf(x, s, c);
+__device__ inline void __sincosf(float x, float* s, float* c) {
+    return __hip_fast_sincosf(x, s, c);
 }

-__device__ inline float __sinf(float x) {
-  return __hip_fast_sinf(x);
-}
+__device__ inline float __sinf(float x) { return __hip_fast_sinf(x); }

-__device__ inline float __tanf(float x) {
-  return __hip_fast_tanf(x);
-}
+__device__ inline float __tanf(float x) { return __hip_fast_tanf(x); }


 /*
 Double Precision Intrinsics
 */

-__device__ static inline double __dadd_rd(double x, double y) {
-  return x + y;
-}
+__device__ static inline double __dadd_rd(double x, double y) { return x + y; }

-__device__ static inline double __dadd_rn(double x, double y) {
-  return x + y;
-}
+__device__ static inline double __dadd_rn(double x, double y) { return x + y; }

-__device__ static inline double __dadd_ru(double x, double y) {
-  return x + y;
-}
+__device__ static inline double __dadd_ru(double x, double y) { return x + y; }

-__device__ static inline double __dadd_rz(double x, double y) {
-  return x + y;
-}
+__device__ static inline double __dadd_rz(double x, double y) { return x + y; }

-__device__ static inline double __ddiv_rd(double x, double y) {
-  return x / y;
-}
+__device__ static inline double __ddiv_rd(double x, double y) { return x / y; }

-__device__ static inline double __ddiv_rn(double x, double y) {
-  return x / y;
-}
+__device__ static inline double __ddiv_rn(double x, double y) { return x / y; }

-__device__ static inline double __ddiv_ru(double x, double y) {
-  return x / y;
-}
+__device__ static inline double __ddiv_ru(double x, double y) { return x / y; }

-__device__ static inline double __ddiv_rz(double x, double y) {
-  return x / y;
-}
+__device__ static inline double __ddiv_rz(double x, double y) { return x / y; }

-__device__ static inline double __dmul_rd(double x, double y) {
-  return x * y;
-}
+__device__ static inline double __dmul_rd(double x, double y) { return x * y; }

-__device__ static inline double __dmul_rn(double x, double y) {
-  return x * y;
-}
+__device__ static inline double __dmul_rn(double x, double y) { return x * y; }

-__device__ static inline double __dmul_ru(double x, double y) {
-  return x * y;
-}
+__device__ static inline double __dmul_ru(double x, double y) { return x * y; }

-__device__ static inline double __dmul_rz(double x, double y) {
-  return x * y;
-}
+__device__ static inline double __dmul_rz(double x, double y) { return x * y; }

-__device__ inline double __drcp_rd(double x) {
-  return __hip_fast_drcp(x);
-}
+__device__ inline double __drcp_rd(double x) { return __hip_fast_drcp(x); }

-__device__ inline double __drcp_rn(double x) {
-  return __hip_fast_drcp(x);
-}
+__device__ inline double __drcp_rn(double x) { return __hip_fast_drcp(x); }

-__device__ inline double __drcp_ru(double x) {
-  return __hip_fast_drcp(x);
-}
+__device__ inline double __drcp_ru(double x) { return __hip_fast_drcp(x); }

-__device__ inline double __drcp_rz(double x) {
-  return __hip_fast_drcp(x);
-}
+__device__ inline double __drcp_rz(double x) { return __hip_fast_drcp(x); }


-__device__ inline double __dsqrt_rd(double x) {
-  return __hip_fast_dsqrt(x);
-}
+__device__ inline double __dsqrt_rd(double x) { return __hip_fast_dsqrt(x); }

-__device__ inline double __dsqrt_rn(double x) {
-  return __hip_fast_dsqrt(x);
-}
+__device__ inline double __dsqrt_rn(double x) { return __hip_fast_dsqrt(x); }

-__device__ inline double __dsqrt_ru(double x) {
-  return __hip_fast_dsqrt(x);
-}
+__device__ inline double __dsqrt_ru(double x) { return __hip_fast_dsqrt(x); }

-__device__ inline double __dsqrt_rz(double x) {
-  return __hip_fast_dsqrt(x);
-}
+__device__ inline double __dsqrt_rz(double x) { return __hip_fast_dsqrt(x); }

-__device__ static inline double __dsub_rd(double x, double y) {
-  return x - y;
-}
+__device__ static inline double __dsub_rd(double x, double y) { return x - y; }

-__device__ static inline double __dsub_rn(double x, double y) {
-  return x - y;
-}
+__device__ static inline double __dsub_rn(double x, double y) { return x - y; }

-__device__ static inline double __dsub_ru(double x, double y) {
-  return x - y;
-}
+__device__ static inline double __dsub_ru(double x, double y) { return x - y; }

-__device__ static inline double __dsub_rz(double x, double y) {
-  return x - y;
-}
+__device__ static inline double __dsub_rz(double x, double y) { return x - y; }

-__device__ inline double __fma_rd(double x, double y, double z) {
-  return __hip_fast_fma(x, y, z);
-}
+__device__ inline double __fma_rd(double x, double y, double z) { return __hip_fast_fma(x, y, z); }

-__device__ inline double __fma_rn(double x, double y, double z) {
-  return __hip_fast_fma(x, y, z);
-}
+__device__ inline double __fma_rn(double x, double y, double z) { return __hip_fast_fma(x, y, z); }

-__device__ inline double __fma_ru(double x, double y, double z) {
-  return __hip_fast_fma(x, y, z);
-}
+__device__ inline double __fma_ru(double x, double y, double z) { return __hip_fast_fma(x, y, z); }

-__device__ inline double __fma_rz(double x, double y, double z) {
-  return __hip_fast_fma(x, y, z);
-}
+__device__ inline double __fma_rz(double x, double y, double z) { return __hip_fast_fma(x, y, z); }


 extern "C" unsigned int __hip_hc_ir_umul24_int(unsigned int, unsigned int);
@@ -425,8 +290,8 @@ extern "C" unsigned int __hip_hc_ir_umulhi_int(unsigned int, unsigned int);
 extern "C" unsigned int __hip_hc_ir_usad_int(unsigned int, unsigned int, unsigned int);

 // integer intrinsic function __popc __clz __ffs __brev
-__device__ unsigned int __brev( unsigned int x);
-__device__ unsigned long long int __brevll( unsigned long long int x);
+__device__ unsigned int __brev(unsigned int x);
+__device__ unsigned long long int __brevll(unsigned long long int x);
 __device__ unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
 __device__ unsigned int __clz(int x);
 __device__ unsigned int __clzll(long long int x);
@@ -448,41 +313,36 @@ __device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
 __device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);

 __device__ static inline unsigned int __hadd(int x, int y) {
-  int z = x + y;
-  int sign = z & 0x8000000;
-  int value = z & 0x7FFFFFFF;
-  return ((value) >> 1 || sign);
-}
-__device__ static inline int __mul24(int x, int y) {
-  return __hip_hc_ir_mul24_int(x, y);
-}
-__device__ static inline int __mulhi(int x, int y) {
-  return __hip_hc_ir_mulhi_int(x, y);
+    int z = x + y;
+    int sign = z & 0x8000000;
+    int value = z & 0x7FFFFFFF;
+    return ((value) >> 1 || sign);
 }
+__device__ static inline int __mul24(int x, int y) { return __hip_hc_ir_mul24_int(x, y); }
+__device__ static inline int __mulhi(int x, int y) { return __hip_hc_ir_mulhi_int(x, y); }
 __device__ static inline int __rhadd(int x, int y) {
-  int z = x + y + 1;
-  int sign = z & 0x8000000;
-  int value = z & 0x7FFFFFFF;
-  return ((value) >> 1 || sign);
+    int z = x + y + 1;
+    int sign = z & 0x8000000;
+    int value = z & 0x7FFFFFFF;
+    return ((value) >> 1 || sign);
 }
 __device__ static inline unsigned int __sad(int x, int y, int z) {
-  return x > y ? x - y + z : y - x + z;
+    return x > y ? x - y + z : y - x + z;
 }
 __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
-  return (x + y) >> 1;
+    return (x + y) >> 1;
 }
 __device__ static inline int __umul24(unsigned int x, unsigned int y) {
-  return __hip_hc_ir_umul24_int(x, y);
+    return __hip_hc_ir_umul24_int(x, y);
 }
 __device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
-  return __hip_hc_ir_umulhi_int(x, y);
+    return __hip_hc_ir_umulhi_int(x, y);
 }
 __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
-  return (x + y + 1) >> 1;
+    return (x + y + 1) >> 1;
 }
-__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z)
-{
-  return __hip_hc_ir_usad_int(x, y, z);
+__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
+    return __hip_hc_ir_usad_int(x, y, z);
 }

 /*
@@ -24,16 +24,14 @@ THE SOFTWARE.
 #define HIP_INCLUDE_HIP_HCC_DETAIL_DRIVER_TYPES_H

 typedef void* hipDeviceptr_t;
-enum hipChannelFormatKind
-{
+enum hipChannelFormatKind {
    hipChannelFormatKindSigned = 0,
    hipChannelFormatKindUnsigned = 1,
    hipChannelFormatKindFloat = 2,
    hipChannelFormatKindNone = 3
 };

-struct hipChannelFormatDesc
-{
+struct hipChannelFormatDesc {
    int x;
    int y;
    int z;
@@ -45,8 +43,7 @@ struct hipChannelFormatDesc
 #define HIP_TRSF_READ_AS_INTEGER 0x01
 #define HIP_TRSA_OVERRIDE_FORMAT 0x01

-enum hipArray_Format
-{
+enum hipArray_Format {
    HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
    HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
    HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
@@ -58,16 +55,16 @@ enum hipArray_Format
 };

 struct HIP_ARRAY_DESCRIPTOR {
-	enum hipArray_Format format;
-	unsigned int numChannels;
-	size_t width;
-	size_t height;
+    enum hipArray_Format format;
+    unsigned int numChannels;
+    size_t width;
+    size_t height;
    unsigned int flags;
    size_t depth;
 };

 struct hipArray {
-    void* data; //FIXME: generalize this
+    void* data;  // FIXME: generalize this
    struct hipChannelFormatDesc desc;
    unsigned int type;
    unsigned int width;
@@ -79,23 +76,23 @@ struct hipArray {
 };

 typedef struct hip_Memcpy2D {
-    size_t  height;
-    size_t  widthInBytes;
+    size_t height;
+    size_t widthInBytes;
    hipArray* dstArray;
    hipDeviceptr_t dstDevice;
-    void * dstHost;
+    void* dstHost;
    hipMemoryType dstMemoryType;
-    size_t  dstPitch;
-    size_t  dstXInBytes;
-    size_t  dstY;
+    size_t dstPitch;
+    size_t dstXInBytes;
+    size_t dstY;
    hipArray* srcArray;
    hipDeviceptr_t srcDevice;
-    const void * srcHost;
+    const void* srcHost;
    hipMemoryType srcMemoryType;
-    size_t  srcPitch;
-    size_t  srcXInBytes;
-    size_t  srcY;
-}hip_Memcpy2D;
+    size_t srcPitch;
+    size_t srcXInBytes;
+    size_t srcY;
+} hip_Memcpy2D;


 typedef struct hipArray* hipArray_t;
@@ -104,68 +101,66 @@ typedef const struct hipArray* hipArray_const_t;

 // TODO: It needs to be modified since it was just copied from hipArray.
 struct hipMipmappedArray {
-    void* data; //FIXME: generalize this
+    void* data;  // FIXME: generalize this
    struct hipChannelFormatDesc desc;
    unsigned int width;
    unsigned int height;
    unsigned int depth;
 };

-typedef struct hipMipmappedArray *hipMipmappedArray_t;
+typedef struct hipMipmappedArray* hipMipmappedArray_t;

-typedef const struct hipMipmappedArray *hipMipmappedArray_const_t;
+typedef const struct hipMipmappedArray* hipMipmappedArray_const_t;

 /**
 * hip resource types
 */
-enum hipResourceType
-{
-    hipResourceTypeArray          = 0x00,
+enum hipResourceType {
+    hipResourceTypeArray = 0x00,
    hipResourceTypeMipmappedArray = 0x01,
-    hipResourceTypeLinear         = 0x02,
-    hipResourceTypePitch2D        = 0x03
+    hipResourceTypeLinear = 0x02,
+    hipResourceTypePitch2D = 0x03
 };

 /**
 * hip texture resource view formats
 */
-enum hipResourceViewFormat
-{
-    hipResViewFormatNone                      = 0x00,
-    hipResViewFormatUnsignedChar1             = 0x01,
-    hipResViewFormatUnsignedChar2             = 0x02,
-    hipResViewFormatUnsignedChar4             = 0x03,
-    hipResViewFormatSignedChar1               = 0x04,
-    hipResViewFormatSignedChar2               = 0x05,
-    hipResViewFormatSignedChar4               = 0x06,
-    hipResViewFormatUnsignedShort1            = 0x07,
-    hipResViewFormatUnsignedShort2            = 0x08,
-    hipResViewFormatUnsignedShort4            = 0x09,
-    hipResViewFormatSignedShort1              = 0x0a,
-    hipResViewFormatSignedShort2              = 0x0b,
-    hipResViewFormatSignedShort4              = 0x0c,
-    hipResViewFormatUnsignedInt1              = 0x0d,
-    hipResViewFormatUnsignedInt2              = 0x0e,
-    hipResViewFormatUnsignedInt4              = 0x0f,
-    hipResViewFormatSignedInt1                = 0x10,
-    hipResViewFormatSignedInt2                = 0x11,
-    hipResViewFormatSignedInt4                = 0x12,
-    hipResViewFormatHalf1                     = 0x13,
-    hipResViewFormatHalf2                     = 0x14,
-    hipResViewFormatHalf4                     = 0x15,
-    hipResViewFormatFloat1                    = 0x16,
-    hipResViewFormatFloat2                    = 0x17,
-    hipResViewFormatFloat4                    = 0x18,
-    hipResViewFormatUnsignedBlockCompressed1  = 0x19,
-    hipResViewFormatUnsignedBlockCompressed2  = 0x1a,
-    hipResViewFormatUnsignedBlockCompressed3  = 0x1b,
-    hipResViewFormatUnsignedBlockCompressed4  = 0x1c,
-    hipResViewFormatSignedBlockCompressed4    = 0x1d,
-    hipResViewFormatUnsignedBlockCompressed5  = 0x1e,
-    hipResViewFormatSignedBlockCompressed5    = 0x1f,
+enum hipResourceViewFormat {
+    hipResViewFormatNone = 0x00,
+    hipResViewFormatUnsignedChar1 = 0x01,
+    hipResViewFormatUnsignedChar2 = 0x02,
+    hipResViewFormatUnsignedChar4 = 0x03,
+    hipResViewFormatSignedChar1 = 0x04,
+    hipResViewFormatSignedChar2 = 0x05,
+    hipResViewFormatSignedChar4 = 0x06,
+    hipResViewFormatUnsignedShort1 = 0x07,
+    hipResViewFormatUnsignedShort2 = 0x08,
+    hipResViewFormatUnsignedShort4 = 0x09,
+    hipResViewFormatSignedShort1 = 0x0a,
+    hipResViewFormatSignedShort2 = 0x0b,
+    hipResViewFormatSignedShort4 = 0x0c,
+    hipResViewFormatUnsignedInt1 = 0x0d,
+    hipResViewFormatUnsignedInt2 = 0x0e,
+    hipResViewFormatUnsignedInt4 = 0x0f,
+    hipResViewFormatSignedInt1 = 0x10,
+    hipResViewFormatSignedInt2 = 0x11,
+    hipResViewFormatSignedInt4 = 0x12,
+    hipResViewFormatHalf1 = 0x13,
+    hipResViewFormatHalf2 = 0x14,
+    hipResViewFormatHalf4 = 0x15,
+    hipResViewFormatFloat1 = 0x16,
+    hipResViewFormatFloat2 = 0x17,
+    hipResViewFormatFloat4 = 0x18,
+    hipResViewFormatUnsignedBlockCompressed1 = 0x19,
+    hipResViewFormatUnsignedBlockCompressed2 = 0x1a,
+    hipResViewFormatUnsignedBlockCompressed3 = 0x1b,
+    hipResViewFormatUnsignedBlockCompressed4 = 0x1c,
+    hipResViewFormatSignedBlockCompressed4 = 0x1d,
+    hipResViewFormatUnsignedBlockCompressed5 = 0x1e,
+    hipResViewFormatSignedBlockCompressed5 = 0x1f,
    hipResViewFormatUnsignedBlockCompressed6H = 0x20,
-    hipResViewFormatSignedBlockCompressed6H   = 0x21,
-    hipResViewFormatUnsignedBlockCompressed7  = 0x22
+    hipResViewFormatSignedBlockCompressed6H = 0x21,
+    hipResViewFormatUnsignedBlockCompressed7 = 0x22
 };

 /**
@@ -182,12 +177,12 @@ struct hipResourceDesc {
            hipMipmappedArray_t mipmap;
        } mipmap;
        struct {
-            void *devPtr;
+            void* devPtr;
            struct hipChannelFormatDesc desc;
            size_t sizeInBytes;
        } linear;
        struct {
-            void *devPtr;
+            void* devPtr;
            struct hipChannelFormatDesc desc;
            size_t width;
            size_t height;
@@ -199,16 +194,15 @@ struct hipResourceDesc {
 /**
 * hip resource view descriptor
 */
-struct hipResourceViewDesc
-{
+struct hipResourceViewDesc {
    enum hipResourceViewFormat format;
-    size_t                     width;
-    size_t                     height;
-    size_t                     depth;
-    unsigned int               firstMipmapLevel;
-    unsigned int               lastMipmapLevel;
-    unsigned int               firstLayer;
-    unsigned int               lastLayer;
+    size_t width;
+    size_t height;
+    size_t depth;
+    unsigned int firstMipmapLevel;
+    unsigned int lastMipmapLevel;
+    unsigned int firstLayer;
+    unsigned int lastLayer;
 };

 /**
@@ -216,23 +210,24 @@ struct hipResourceViewDesc
 *
 */
 typedef enum hipMemcpyKind {
-    hipMemcpyHostToHost = 0,	///< Host-to-Host Copy
-    hipMemcpyHostToDevice = 1,  ///< Host-to-Device Copy
-    hipMemcpyDeviceToHost = 2,  ///< Device-to-Host Copy
-    hipMemcpyDeviceToDevice =3, ///< Device-to-Device Copy
-    hipMemcpyDefault = 4      	///< Runtime will automatically determine copy-kind based on virtual addresses.
+    hipMemcpyHostToHost = 0,      ///< Host-to-Host Copy
+    hipMemcpyHostToDevice = 1,    ///< Host-to-Device Copy
+    hipMemcpyDeviceToHost = 2,    ///< Device-to-Host Copy
+    hipMemcpyDeviceToDevice = 3,  ///< Device-to-Device Copy
+    hipMemcpyDefault =
+        4  ///< Runtime will automatically determine copy-kind based on virtual addresses.
 } hipMemcpyKind;

-struct hipPitchedPtr
-{
-    void   *ptr;
-    size_t  pitch;
-    size_t  xsize;
-    size_t  ysize;
+struct hipPitchedPtr {
+    void* ptr;
+    size_t pitch;
+    size_t xsize;
+    size_t ysize;
 };

 struct hipExtent {
-    size_t width;     // Width in elements when referring to array memory, in bytes when referring to linear memory
+    size_t width;  // Width in elements when referring to array memory, in bytes when referring to
+                   // linear memory
    size_t height;
    size_t depth;
 };
@@ -244,74 +239,72 @@ struct hipPos {
 };

 struct hipMemcpy3DParms {
-    hipArray_t            srcArray;
-    struct hipPos         srcPos;
-    struct hipPitchedPtr  srcPtr;
+    hipArray_t srcArray;
+    struct hipPos srcPos;
+    struct hipPitchedPtr srcPtr;

-    hipArray_t            dstArray;
-    struct hipPos         dstPos;
-    struct hipPitchedPtr  dstPtr;
+    hipArray_t dstArray;
+    struct hipPos dstPos;
+    struct hipPitchedPtr dstPtr;

-    struct hipExtent      extent;
-    enum hipMemcpyKind    kind;
-    
-    size_t                Depth;
-    size_t                Height;
-    size_t                WidthInBytes;
-    hipDeviceptr_t        dstDevice;
-    size_t                dstHeight;
-    void *                dstHost;
-    size_t                dstLOD;
-    hipMemoryType         dstMemoryType;
-    size_t                dstPitch;
-    size_t                dstXInBytes;
-    size_t                dstY;
-    size_t                dstZ;
-    void *                reserved0;
-    void *                reserved1;
-    hipDeviceptr_t        srcDevice;
-    size_t                srcHeight;
-    const void *          srcHost;
-    size_t                srcLOD;
-    hipMemoryType          srcMemoryType;
-    size_t                srcPitch;
-    size_t                srcXInBytes;
-    size_t                srcY;
-    size_t                srcZ;
+    struct hipExtent extent;
+    enum hipMemcpyKind kind;
+
+    size_t Depth;
+    size_t Height;
+    size_t WidthInBytes;
+    hipDeviceptr_t dstDevice;
+    size_t dstHeight;
+    void* dstHost;
+    size_t dstLOD;
+    hipMemoryType dstMemoryType;
+    size_t dstPitch;
+    size_t dstXInBytes;
+    size_t dstY;
+    size_t dstZ;
+    void* reserved0;
+    void* reserved1;
+    hipDeviceptr_t srcDevice;
+    size_t srcHeight;
+    const void* srcHost;
+    size_t srcLOD;
+    hipMemoryType srcMemoryType;
+    size_t srcPitch;
+    size_t srcXInBytes;
+    size_t srcY;
+    size_t srcZ;
 };

-static __inline__ struct hipPitchedPtr make_hipPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
-{
-	struct hipPitchedPtr s;
+static __inline__ struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
+                                                          size_t ysz) {
+    struct hipPitchedPtr s;

-	s.ptr   = d;
-	s.pitch = p;
-	s.xsize = xsz;
-	s.ysize = ysz;
+    s.ptr = d;
+    s.pitch = p;
+    s.xsize = xsz;
+    s.ysize = ysz;

-	return s;
+    return s;
 }

-static __inline__ struct hipPos make_hipPos(size_t x, size_t y, size_t z)
-{
-	struct hipPos p;
+static __inline__ struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
+    struct hipPos p;

-	p.x = x;
-	p.y = y;
-	p.z = z;
+    p.x = x;
+    p.y = y;
+    p.z = z;

-	return p;
+    return p;
 }

-static __inline__ struct hipExtent make_hipExtent(size_t w, size_t h, size_t d)
-{
-	struct hipExtent e;
+static __inline__ struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
+    struct hipExtent e;

-	e.width  = w;
-	e.height = h;
-	e.depth  = d;
+    e.width = w;
+    e.height = h;
+    e.depth = d;

-	return e;
+    return e;
 }

 #endif
@@ -44,116 +44,60 @@ THE SOFTWARE.
 #include <utility>
 #include <vector>

-namespace hip_impl
-{
-    template<
-        typename T,
-        typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
-    inline
-    T round_up_to_next_multiple_nonnegative(T x, T y)
-    {
-        T tmp = x + y - 1;
-        return tmp - tmp % y;
-    }
+namespace hip_impl {
+template <typename T, typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+inline T round_up_to_next_multiple_nonnegative(T x, T y) {
+    T tmp = x + y - 1;
+    return tmp - tmp % y;
+}

-    inline
-    std::vector<std::uint8_t> make_kernarg()
-    {
-        return {};
-    }
+inline std::vector<std::uint8_t> make_kernarg() { return {}; }

-    inline
-    std::vector<std::uint8_t> make_kernarg(std::vector<std::uint8_t> kernarg)
-    {
-        return kernarg;
-    }
+inline std::vector<std::uint8_t> make_kernarg(std::vector<std::uint8_t> kernarg) { return kernarg; }

-    template<typename T>
-    inline
-    std::vector<std::uint8_t> make_kernarg(std::vector<uint8_t> kernarg, T x)
-    {
-        kernarg.resize(
-            round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) +
-            sizeof(T));
+template <typename T>
+inline std::vector<std::uint8_t> make_kernarg(std::vector<uint8_t> kernarg, T x) {
+    kernarg.resize(round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) + sizeof(T));

-        new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)};
+    new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)};

-        return kernarg;
-    }
+    return kernarg;
+}

-    template<typename T, typename... Ts>
-    inline
-    std::vector<std::uint8_t> make_kernarg(
-        std::vector<std::uint8_t> kernarg, T x, Ts... xs)
-    {
-        return make_kernarg(
-            make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...);
-    }
+template <typename T, typename... Ts>
+inline std::vector<std::uint8_t> make_kernarg(std::vector<std::uint8_t> kernarg, T x, Ts... xs) {
+    return make_kernarg(make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...);
+}

-    template<typename... Ts>
-    inline
-    std::vector<std::uint8_t> make_kernarg(Ts... xs)
-    {
-        std::vector<std::uint8_t> kernarg;
-        kernarg.reserve(sizeof(std::tuple<Ts...>));
+template <typename... Ts>
+inline std::vector<std::uint8_t> make_kernarg(Ts... xs) {
+    std::vector<std::uint8_t> kernarg;
+    kernarg.reserve(sizeof(std::tuple<Ts...>));

-        return make_kernarg(std::move(kernarg), std::move(xs)...);
-    }
+    return make_kernarg(std::move(kernarg), std::move(xs)...);
+}

-    void hipLaunchKernelGGLImpl(
-        std::uintptr_t function_address,
-        const dim3& numBlocks,
-        const dim3& dimBlocks,
-        std::uint32_t sharedMemBytes,
-        hipStream_t stream,
-        void** kernarg);
-} // Namespace hip_impl.
+void hipLaunchKernelGGLImpl(std::uintptr_t function_address, const dim3& numBlocks,
+                            const dim3& dimBlocks, std::uint32_t sharedMemBytes, hipStream_t stream,
+                            void** kernarg);
+}  // Namespace hip_impl.

-template<typename... Args, typename F = void (*)(Args...)>
-inline
-void hipLaunchKernelGGL(
-    F kernel,
-    const dim3& numBlocks,
-    const dim3& dimBlocks,
-    std::uint32_t sharedMemBytes,
-    hipStream_t stream,
-    Args... args)
-{
+template <typename... Args, typename F = void (*)(Args...)>
+inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+                               std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
    auto kernarg = hip_impl::make_kernarg(std::move(args)...);
    std::size_t kernarg_size = kernarg.size();

-    void* config[] = {
-      HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(),
-      HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size,
-      HIP_LAUNCH_PARAM_END
-    };
+    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), HIP_LAUNCH_PARAM_BUFFER_SIZE,
+                      &kernarg_size, HIP_LAUNCH_PARAM_END};

-    hip_impl::hipLaunchKernelGGLImpl(
-        reinterpret_cast<std::uintptr_t>(kernel),
-        numBlocks,
-        dimBlocks,
-        sharedMemBytes,
-        stream,
-        &config[0]);
+    hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel), numBlocks, dimBlocks,
+                                     sharedMemBytes, stream, &config[0]);
 }

-template<typename... Args, typename F = void (*)(hipLaunchParm, Args...)>
-inline
-void hipLaunchKernel(
-    F kernel,
-    const dim3& numBlocks,
-    const dim3& dimBlocks,
-    std::uint32_t groupMemBytes,
-    hipStream_t stream,
-    Args... args)
-{
-    hipLaunchKernelGGL(
-        kernel,
-        numBlocks,
-        dimBlocks,
-        groupMemBytes,
-        stream,
-        hipLaunchParm{},
-        std::move(args)...);
+template <typename... Args, typename F = void (*)(hipLaunchParm, Args...)>
+inline void hipLaunchKernel(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+                            std::uint32_t groupMemBytes, hipStream_t stream, Args... args) {
+    hipLaunchKernelGGL(kernel, numBlocks, dimBlocks, groupMemBytes, stream, hipLaunchParm{},
+                       std::move(args)...);
 }
-
@@ -22,9 +22,9 @@ THE SOFTWARE.
 #pragma once

 #if GENERIC_GRID_LAUNCH == 1
-    #if __hcc_workweek__ >= 17481
-        #include "functional_grid_launch.hpp"
-    #else
-        #include "macro_based_grid_launch.hpp"
-    #endif
-#endif //GENERIC_GRID_LAUNCH
+#if __hcc_workweek__ >= 17481
+#include "functional_grid_launch.hpp"
+#else
+#include "macro_based_grid_launch.hpp"
+#endif
+#endif  // GENERIC_GRID_LAUNCH
@@ -23,111 +23,88 @@ THE SOFTWARE.
 #pragma once
 #include "concepts.hpp"

-#include <type_traits> // For std::conditional, std::decay, std::enable_if,
-                       // std::false_type, std result_of and std::true_type.
-#include <utility>     // For std::declval.
+#include <type_traits>  // For std::conditional, std::decay, std::enable_if,
+                        // std::false_type, std::result_of and std::true_type.
+#include <utility>      // For std::declval.

-namespace std
-{   // TODO: these should be removed as soon as possible.
-    #if (__cplusplus < 201406L)
-        #if (__cplusplus < 201402L)
-            template<bool cond, typename T = void>
-            using enable_if_t = typename enable_if<cond, T>::type;
-            template<bool cond, typename T, typename U>
-            using conditional_t = typename conditional<cond, T, U>::type;
-            template<typename T>
-            using decay_t = typename decay<T>::type;
-            template<FunctionalProcedure F, typename... Ts>
-            using result_of_t = typename result_of<F(Ts...)>::type;
-            template<typename T>
-            using remove_reference_t = typename remove_reference<T>::type;
-        #endif
-    #endif
-}
+namespace std {  // TODO: these should be removed as soon as possible.
+#if (__cplusplus < 201406L)
+#if (__cplusplus < 201402L)
+template <bool cond, typename T = void>
+using enable_if_t = typename enable_if<cond, T>::type;
+template <bool cond, typename T, typename U>
+using conditional_t = typename conditional<cond, T, U>::type;
+template <typename T>
+using decay_t = typename decay<T>::type;
+template <FunctionalProcedure F, typename... Ts>
+using result_of_t = typename result_of<F(Ts...)>::type;
+template <typename T>
+using remove_reference_t = typename remove_reference<T>::type;
+#endif
+#endif
+}  // namespace std

-namespace hip_impl
-{
-    template<typename...>
-    using void_t_ = void;
+namespace hip_impl {
+template <typename...>
+using void_t_ = void;

-    #if (__cplusplus < 201402L)
-        template<
-            FunctionalProcedure F,
-            unsigned int n = 0u,
-            typename = void>
-        struct is_callable_impl : is_callable_impl<F, n + 1u> {};
+#if (__cplusplus < 201402L)
+template <FunctionalProcedure F, unsigned int n = 0u, typename = void>
+struct is_callable_impl : is_callable_impl<F, n + 1u> {};

-        // Pointer to member function, call through non-pointer.
-        template<FunctionalProcedure F, typename C, typename... Ts>
-        struct is_callable_impl<
-            F(C, Ts...),
-            0u,
-            void_t_<decltype((std::declval<C>().*std::declval<F>())(
-                std::declval<Ts>()...))>
-        > : std::true_type {};
+// Pointer to member function, call through non-pointer.
+template <FunctionalProcedure F, typename C, typename... Ts>
+struct is_callable_impl<
+    F(C, Ts...), 0u,
+    void_t_<decltype((std::declval<C>().*std::declval<F>())(std::declval<Ts>()...))> >
+    : std::true_type {};

-        // Pointer to member function, call through pointer.
-        template<FunctionalProcedure F, typename C, typename... Ts>
-        struct is_callable_impl<
-            F(C, Ts...),
-            1u,
-            void_t_<decltype(((*std::declval<C>()).*std::declval<F>())(
-                std::declval<Ts>()...))>
-        > : std::true_type {};
+// Pointer to member function, call through pointer.
+template <FunctionalProcedure F, typename C, typename... Ts>
+struct is_callable_impl<
+    F(C, Ts...), 1u,
+    void_t_<decltype(((*std::declval<C>()).*std::declval<F>())(std::declval<Ts>()...))> >
+    : std::true_type {};

-        // Pointer to member data, call through non-pointer, no args.
-        template<FunctionalProcedure F, typename C>
-        struct is_callable_impl<
-            F(C),
-            2u,
-            void_t_<decltype(std::declval<C>().*std::declval<F>())>
-        > : std::true_type {};
+// Pointer to member data, call through non-pointer, no args.
+template <FunctionalProcedure F, typename C>
+struct is_callable_impl<F(C), 2u, void_t_<decltype(std::declval<C>().*std::declval<F>())> >
+    : std::true_type {};

-        // Pointer to member data, call through pointer, no args.
-        template<FunctionalProcedure F, typename C>
-        struct is_callable_impl<
-            F(C),
-            3u,
-            void_t_<decltype(*std::declval<C>().*std::declval<F>())>
-        > : std::true_type {};
+// Pointer to member data, call through pointer, no args.
+template <FunctionalProcedure F, typename C>
+struct is_callable_impl<F(C), 3u, void_t_<decltype(*std::declval<C>().*std::declval<F>())> >
+    : std::true_type {};

-        // General call, n args.
-        template<FunctionalProcedure F, typename... Ts>
-        struct is_callable_impl<
-            F(Ts...),
-            4u,
-            void_t_<decltype(std::declval<F>()(std::declval<Ts>()...))>
-        > : std::true_type {};
+// General call, n args.
+template <FunctionalProcedure F, typename... Ts>
+struct is_callable_impl<F(Ts...), 4u, void_t_<decltype(std::declval<F>()(std::declval<Ts>()...))> >
+    : std::true_type {};

-        // Not callable.
-        template<FunctionalProcedure F>
-        struct is_callable_impl<F, 5u> : std::false_type {};
-    #else
-        template<typename, typename = void>
-        struct is_callable_impl : std::false_type {};
+// Not callable.
+template <FunctionalProcedure F>
+struct is_callable_impl<F, 5u> : std::false_type {};
+#else
+template <typename, typename = void>
+struct is_callable_impl : std::false_type {};

-        template<FunctionalProcedure F, typename... Ts>
-        struct is_callable_impl<
-            F(Ts...),
-            void_t_<std::result_of_t<F(Ts...)>>> : std::true_type {};
-    #endif
-        template<typename Call>
-        struct is_callable : is_callable_impl<Call> {};
+template <FunctionalProcedure F, typename... Ts>
+struct is_callable_impl<F(Ts...), void_t_<std::result_of_t<F(Ts...)> > > : std::true_type {};
+#endif
+template <typename Call>
+struct is_callable : is_callable_impl<Call> {};

-    #define count_macro_args_impl_hip_(\
-         _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15,\
-         _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29,\
-         _30, _31, _n, ...)\
-         _n
-    #define count_macro_args_hip_(...)\
-        count_macro_args_impl_hip_(\
-            , ##__VA_ARGS__, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,\
-            18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#define count_macro_args_impl_hip_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13,     \
+                                   _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25,     \
+                                   _26, _27, _28, _29, _30, _31, _n, ...)                          \
+    _n
+#define count_macro_args_hip_(...)                                                                 \
+    count_macro_args_impl_hip_(, ##__VA_ARGS__, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20,    \
+                               19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,  \
+                               0)

-    #define overloaded_macro_expand_hip_(macro, arg_cnt) macro##arg_cnt
-    #define overload_macro_impl_hip_(macro, arg_cnt)\
-        overloaded_macro_expand_hip_(macro, arg_cnt)
-    #define overload_macro_hip_(macro, ...)\
-        overload_macro_impl_hip_(macro, count_macro_args_hip_(__VA_ARGS__))\
-            (__VA_ARGS__)
-}
+#define overloaded_macro_expand_hip_(macro, arg_cnt) macro##arg_cnt
+#define overload_macro_impl_hip_(macro, arg_cnt) overloaded_macro_expand_hip_(macro, arg_cnt)
+#define overload_macro_hip_(macro, ...)                                                            \
+    overload_macro_impl_hip_(macro, count_macro_args_hip_(__VA_ARGS__))(__VA_ARGS__)
+}  // namespace hip_impl
@@ -27,79 +27,79 @@ THE SOFTWARE.
 #include <math.h>

 #if __cplusplus
-#define COMPLEX_ADD_OP_OVERLOAD(type) \
-__device__ __host__ static inline type operator + (const type& lhs, const type& rhs) { \
-  type ret; \
-  ret.x = lhs.x + rhs.x ; \
-  ret.y = lhs.y + rhs.y ; \
-  return ret; \
-}
+#define COMPLEX_ADD_OP_OVERLOAD(type)                                                              \
+    __device__ __host__ static inline type operator+(const type& lhs, const type& rhs) {           \
+        type ret;                                                                                  \
+        ret.x = lhs.x + rhs.x;                                                                     \
+        ret.y = lhs.y + rhs.y;                                                                     \
+        return ret;                                                                                \
+    }

-#define COMPLEX_SUB_OP_OVERLOAD(type) \
-__device__ __host__ static inline type operator - (const type& lhs, const type& rhs) { \
-  type ret; \
-  ret.x = lhs.x - rhs.x; \
-  ret.y = lhs.y - rhs.y; \
-  return ret; \
-}
+#define COMPLEX_SUB_OP_OVERLOAD(type)                                                              \
+    __device__ __host__ static inline type operator-(const type& lhs, const type& rhs) {           \
+        type ret;                                                                                  \
+        ret.x = lhs.x - rhs.x;                                                                     \
+        ret.y = lhs.y - rhs.y;                                                                     \
+        return ret;                                                                                \
+    }

-#define COMPLEX_MUL_OP_OVERLOAD(type) \
-__device__ __host__ static inline type operator * (const type& lhs, const type& rhs) { \
-  type ret; \
-  ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \
-  ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \
-  return ret; \
-}
+#define COMPLEX_MUL_OP_OVERLOAD(type)                                                              \
+    __device__ __host__ static inline type operator*(const type& lhs, const type& rhs) {           \
+        type ret;                                                                                  \
+        ret.x = lhs.x * rhs.x - lhs.y * rhs.y;                                                     \
+        ret.y = lhs.x * rhs.y + lhs.y * rhs.x;                                                     \
+        return ret;                                                                                \
+    }

-#define COMPLEX_DIV_OP_OVERLOAD(type) \
-__device__ __host__ static inline type operator / (const type& lhs, const type& rhs) { \
-  type ret; \
-  ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \
-  ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \
-  ret.x = ret.x / (rhs.x * rhs.x + rhs.y * rhs.y); \
-  ret.y = ret.y / (rhs.x * rhs.x + rhs.y * rhs.y); \
-  return ret; \
-}
+#define COMPLEX_DIV_OP_OVERLOAD(type)                                                              \
+    __device__ __host__ static inline type operator/(const type& lhs, const type& rhs) {           \
+        type ret;                                                                                  \
+        ret.x = (lhs.x * rhs.x + lhs.y * rhs.y);                                                   \
+        ret.y = (rhs.x * lhs.y - lhs.x * rhs.y);                                                   \
+        ret.x = ret.x / (rhs.x * rhs.x + rhs.y * rhs.y);                                           \
+        ret.y = ret.y / (rhs.x * rhs.x + rhs.y * rhs.y);                                           \
+        return ret;                                                                                \
+    }

-#define COMPLEX_ADD_PREOP_OVERLOAD(type) \
-__device__ __host__ static inline type& operator += (type& lhs, const type& rhs) { \
-  lhs.x += rhs.x; \
-  lhs.y += rhs.y; \
-  return lhs; \
-}
+#define COMPLEX_ADD_PREOP_OVERLOAD(type)                                                           \
+    __device__ __host__ static inline type& operator+=(type& lhs, const type& rhs) {               \
+        lhs.x += rhs.x;                                                                            \
+        lhs.y += rhs.y;                                                                            \
+        return lhs;                                                                                \
+    }

-#define COMPLEX_SUB_PREOP_OVERLOAD(type) \
-__device__ __host__ static inline type& operator -= (type& lhs, const type& rhs) { \
-  lhs.x -= rhs.x; \
-  lhs.y -= rhs.y; \
-  return lhs; \
-}
+#define COMPLEX_SUB_PREOP_OVERLOAD(type)                                                           \
+    __device__ __host__ static inline type& operator-=(type& lhs, const type& rhs) {               \
+        lhs.x -= rhs.x;                                                                            \
+        lhs.y -= rhs.y;                                                                            \
+        return lhs;                                                                                \
+    }

-#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
-__device__ __host__ static inline type& operator *= (type& lhs, const type& rhs) { \
-  lhs = lhs * rhs; \
-  return lhs; \
-}
+#define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                           \
+    __device__ __host__ static inline type& operator*=(type& lhs, const type& rhs) {               \
+        lhs = lhs * rhs;                                                                           \
+        return lhs;                                                                                \
+    }

-#define COMPLEX_DIV_PREOP_OVERLOAD(type) \
-__device__ __host__ static inline type& operator /= (type& lhs, const type& rhs) { \
-  lhs = lhs / rhs; \
-  return lhs; \
-}
+#define COMPLEX_DIV_PREOP_OVERLOAD(type)                                                           \
+    __device__ __host__ static inline type& operator/=(type& lhs, const type& rhs) {               \
+        lhs = lhs / rhs;                                                                           \
+        return lhs;                                                                                \
+    }

-#define COMPLEX_SCALAR_PRODUCT(type, type1) \
-__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
-  type ret; \
-  ret.x = lhs.x * rhs; \
-  ret.y = lhs.y * rhs; \
-  return ret; \
-}
+#define COMPLEX_SCALAR_PRODUCT(type, type1)                                                        \
+    __device__ __host__ static inline type operator*(const type& lhs, type1 rhs) {                 \
+        type ret;                                                                                  \
+        ret.x = lhs.x * rhs;                                                                       \
+        ret.y = lhs.y * rhs;                                                                       \
+        return ret;                                                                                \
+    }

 #endif

 struct hipFloatComplex {
-  #ifdef __cplusplus
-    public:
+#ifdef __cplusplus
+   public:
    __device__ __host__ hipFloatComplex() : x(0.0f), y(0.0f) {}
    __device__ __host__ hipFloatComplex(float x) : x(x), y(0.0f) {}
    __device__ __host__ hipFloatComplex(float x, float y) : x(x), y(y) {}
@@ -112,27 +112,27 @@ struct hipFloatComplex {
    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed long)
    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, unsigned long long)
    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipFloatComplex, signed long long)
-  #endif
-  float x, y;
+#endif
+    float x, y;
 } __attribute__((aligned(8)));

 struct hipDoubleComplex {
-  #ifdef __cplusplus
-    public:
-      __device__ __host__ hipDoubleComplex() : x(0.0f), y(0.0f) {}
-      __device__ __host__ hipDoubleComplex(double x) : x(x), y(0.0f) {}
-      __device__ __host__ hipDoubleComplex(double x, double y) : x(x), y(y) {}
-      MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned short)
-      MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed short)
-      MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned int)
-      MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed int)
-      MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, float)
-      MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned long)
-      MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed long)
-      MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned long long)
-      MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed long long)
-  #endif
-  double x, y;
+#ifdef __cplusplus
+   public:
+    __device__ __host__ hipDoubleComplex() : x(0.0f), y(0.0f) {}
+    __device__ __host__ hipDoubleComplex(double x) : x(x), y(0.0f) {}
+    __device__ __host__ hipDoubleComplex(double x, double y) : x(x), y(y) {}
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(hipDoubleComplex, signed long long)
+#endif
+    double x, y;
 } __attribute__((aligned(16)));

 #if __cplusplus
@@ -177,126 +177,110 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)

 #endif

-__device__ __host__ static inline float hipCrealf(hipFloatComplex z){
-    return z.x;
-}
+__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }

-__device__ __host__ static inline float hipCimagf(hipFloatComplex z){
-    return z.y;
-}
+__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }

-__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
+__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
    hipFloatComplex z;
    z.x = a;
    z.y = b;
    return z;
 }

-__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
+__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
    hipFloatComplex ret;
    ret.x = z.x;
    ret.y = -z.y;
    return ret;
 }

-__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){
+__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
    return z.x * z.x + z.y * z.y;
 }

-__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
    return make_hipFloatComplex(p.x + q.x, p.y + q.y);
 }

-__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
    return make_hipFloatComplex(p.x - q.x, p.y - q.y);
 }

-__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
    return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }

-__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
    float sqabs = hipCsqabsf(q);
    hipFloatComplex ret;
-    ret.x = (p.x * q.x + p.y * q.y)/sqabs;
-    ret.y = (p.y * q.x - p.x * q.y)/sqabs;
+    ret.x = (p.x * q.x + p.y * q.y) / sqabs;
+    ret.y = (p.y * q.x - p.x * q.y) / sqabs;
    return ret;
 }

-__device__ __host__ static inline float hipCabsf(hipFloatComplex z){
-    return sqrtf(hipCsqabsf(z));
-}
+__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }


+__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }

-__device__ __host__ static inline double hipCreal(hipDoubleComplex z){
-    return z.x;
-}
+__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }

-__device__ __host__ static inline double hipCimag(hipDoubleComplex z){
-    return z.y;
-}
-
-__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
+__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
    hipDoubleComplex z;
    z.x = a;
    z.y = b;
    return z;
 }

-__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
+__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
    hipDoubleComplex ret;
    ret.x = z.x;
-    ret.y = z.y;
+    ret.y = -z.y;  // conjugate negates the imaginary part, as hipConjf does
    return ret;
 }

-__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z){
+__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
    return z.x * z.x + z.y * z.y;
 }

-__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
+__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
    return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
+__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
    return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
+__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
    return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
+__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
    double sqabs = hipCsqabs(q);
    hipDoubleComplex ret;
-    ret.x = (p.x * q.x + p.y * q.y)/sqabs;
-    ret.y = (p.y * q.x - p.x * q.y)/sqabs;
+    ret.x = (p.x * q.x + p.y * q.y) / sqabs;
+    ret.y = (p.y * q.x - p.x * q.y) / sqabs;
    return ret;
 }

-__device__ __host__ static inline double hipCabs(hipDoubleComplex z){
-    return sqrtf(hipCsqabs(z));
-}
+__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); }

 typedef hipFloatComplex hipComplex;

-__device__ __host__ static inline hipComplex make_hipComplex(float x,
-                                            float y){
+__device__ __host__ static inline hipComplex make_hipComplex(float x, float y) {
    return make_hipFloatComplex(x, y);
 }

-__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat
-(hipDoubleComplex z){
+__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
    return make_hipFloatComplex((float)z.x, (float)z.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble
-(hipFloatComplex z){
+__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
    return make_hipDoubleComplex((double)z.x, (double)z.y);
 }

-__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
+__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
    float real = (p.x * q.x) + r.x;
    float imag = (q.x * p.y) + r.y;

@@ -306,7 +290,8 @@ __device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q
    return make_hipComplex(real, imag);
 }

-__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
+__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+                                                           hipDoubleComplex r) {
    float real = (p.x * q.x) + r.x;
    float imag = (q.x * p.y) + r.y;

@@ -10,10 +10,9 @@
 * HIP maintains a table for all memory allocations performed by the application.
 * If targetAddress is 0, the entire table is printed to stderr.
 * If targetAddress is non-null, this routine will perform some forensic analysis
- * to find the pointer 
+ * to find the pointer
 */
-void hipdbPrintMem(void *targetAddress);
-
+void hipdbPrintMem(void* targetAddress);


 // doxygen end HipDb
@@ -24,7 +24,7 @@ THE SOFTWARE.
 #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H

 #include "hip/hcc_detail/hip_vector_types.h"
-#if ( __clang_major__ > 3)
+#if (__clang_major__ > 3)
 typedef __fp16 __half;
 typedef __fp16 __half1 __attribute__((ext_vector_type(1)));
 typedef __fp16 __half2 __attribute__((ext_vector_type(2)));
@@ -63,32 +63,32 @@ __device__ static __half2 h2div(__half2 a, __half2 b);
-Half Comparision Functions
+Half Comparison Functions
 */

-__device__  bool __heq(__half a, __half b);
-__device__  bool __hge(__half a, __half b);
-__device__  bool __hgt(__half a, __half b);
-__device__  bool __hisinf(__half a);
-__device__  bool __hisnan(__half a);
-__device__  bool __hle(__half a, __half b);
-__device__  bool __hlt(__half a, __half b);
-__device__  bool __hne(__half a, __half b);
+__device__ bool __heq(__half a, __half b);
+__device__ bool __hge(__half a, __half b);
+__device__ bool __hgt(__half a, __half b);
+__device__ bool __hisinf(__half a);
+__device__ bool __hisnan(__half a);
+__device__ bool __hle(__half a, __half b);
+__device__ bool __hlt(__half a, __half b);
+__device__ bool __hne(__half a, __half b);

 /*
-Half2 Comparision Functions
+Half2 Comparison Functions
 */

-__device__  bool __hbeq2(__half2 a, __half2 b);
-__device__  bool __hbge2(__half2 a, __half2 b);
-__device__  bool __hbgt2(__half2 a, __half2 b);
-__device__  bool __hble2(__half2 a, __half2 b);
-__device__  bool __hblt2(__half2 a, __half2 b);
-__device__  bool __hbne2(__half2 a, __half2 b);
-__device__  __half2 __heq2(__half2 a, __half2 b);
-__device__  __half2 __hge2(__half2 a, __half2 b);
-__device__  __half2 __hgt2(__half2 a, __half2 b);
-__device__  __half2 __hisnan2(__half2 a);
-__device__  __half2 __hle2(__half2 a, __half2 b);
-__device__  __half2 __hlt2(__half2 a, __half2 b);
-__device__  __half2 __hne2(__half2 a, __half2 b);
+__device__ bool __hbeq2(__half2 a, __half2 b);
+__device__ bool __hbge2(__half2 a, __half2 b);
+__device__ bool __hbgt2(__half2 a, __half2 b);
+__device__ bool __hble2(__half2 a, __half2 b);
+__device__ bool __hblt2(__half2 a, __half2 b);
+__device__ bool __hbne2(__half2 a, __half2 b);
+__device__ __half2 __heq2(__half2 a, __half2 b);
+__device__ __half2 __hge2(__half2 a, __half2 b);
+__device__ __half2 __hgt2(__half2 a, __half2 b);
+__device__ __half2 __hisnan2(__half2 a);
+__device__ __half2 __hle2(__half2 a, __half2 b);
+__device__ __half2 __hlt2(__half2 a, __half2 b);
+__device__ __half2 __hne2(__half2 a, __half2 b);

 /*
 Half Math Functions
@@ -130,80 +130,80 @@ __device__ static __half2 h2sqrt(const __half2 h);
 Half Conversion And Data Movement
 */

-__device__  __half2 __float22half2_rn(const float2 a);
-__device__  __half __float2half(const float a);
-__device__  __half2 __float2half2_rn(const float a);
-__device__  __half __float2half_rd(const float a);
-__device__  __half __float2half_rn(const float a);
-__device__  __half __float2half_ru(const float a);
-__device__  __half __float2half_rz(const float a);
-__device__  __half2 __floats2half2_rn(const float a, const float b);
-__device__  float2 __half22float2(const __half2 a);
-__device__  float __half2float(const __half a);
-__device__  __half2 half2half2(const __half a);
-__device__  int __half2int_rd(__half h);
-__device__  int __half2int_rn(__half h);
-__device__  int __half2int_ru(__half h);
-__device__  int __half2int_rz(__half h);
-__device__  long long int __half2ll_rd(__half h);
-__device__  long long int __half2ll_rn(__half h);
-__device__  long long int __half2ll_ru(__half h);
-__device__  long long int __half2ll_rz(__half h);
-__device__  short __half2short_rd(__half h);
-__device__  short __half2short_rn(__half h);
-__device__  short __half2short_ru(__half h);
-__device__  short __half2short_rz(__half h);
-__device__  unsigned int __half2uint_rd(__half h);
-__device__  unsigned int __half2uint_rn(__half h);
-__device__  unsigned int __half2uint_ru(__half h);
-__device__  unsigned int __half2uint_rz(__half h);
-__device__  unsigned long long int __half2ull_rd(__half h);
-__device__  unsigned long long int __half2ull_rn(__half h);
-__device__  unsigned long long int __half2ull_ru(__half h);
-__device__  unsigned long long int __half2ull_rz(__half h);
-__device__  unsigned short int __half2ushort_rd(__half h);
-__device__  unsigned short int __half2ushort_rn(__half h);
-__device__  unsigned short int __half2ushort_ru(__half h);
-__device__  unsigned short int __half2ushort_rz(__half h);
-__device__  short int __half_as_short(const __half h);
-__device__  unsigned short int __half_as_ushort(const __half h);
-__device__  __half2 __halves2half2(const __half a, const __half b);
-__device__  float __high2float(const __half2 a);
-__device__  __half __high2half(const __half2 a);
-__device__  __half2 __high2half2(const __half2 a);
-__device__  __half2 __highs2half2(const __half2 a, const __half2 b);
-__device__  __half __int2half_rd(int i);
-__device__  __half __int2half_rn(int i);
-__device__  __half __int2half_ru(int i);
-__device__  __half __int2half_rz(int i);
-__device__  __half __ll2half_rd(long long int i);
-__device__  __half __ll2half_rn(long long int i);
-__device__  __half __ll2half_ru(long long int i);
-__device__  __half __ll2half_rz(long long int i);
-__device__  float __low2float(const __half2 a);
+__device__ __half2 __float22half2_rn(const float2 a);
+__device__ __half __float2half(const float a);
+__device__ __half2 __float2half2_rn(const float a);
+__device__ __half __float2half_rd(const float a);
+__device__ __half __float2half_rn(const float a);
+__device__ __half __float2half_ru(const float a);
+__device__ __half __float2half_rz(const float a);
+__device__ __half2 __floats2half2_rn(const float a, const float b);
+__device__ float2 __half22float2(const __half2 a);
+__device__ float __half2float(const __half a);
+__device__ __half2 half2half2(const __half a);
+__device__ int __half2int_rd(__half h);
+__device__ int __half2int_rn(__half h);
+__device__ int __half2int_ru(__half h);
+__device__ int __half2int_rz(__half h);
+__device__ long long int __half2ll_rd(__half h);
+__device__ long long int __half2ll_rn(__half h);
+__device__ long long int __half2ll_ru(__half h);
+__device__ long long int __half2ll_rz(__half h);
+__device__ short __half2short_rd(__half h);
+__device__ short __half2short_rn(__half h);
+__device__ short __half2short_ru(__half h);
+__device__ short __half2short_rz(__half h);
+__device__ unsigned int __half2uint_rd(__half h);
+__device__ unsigned int __half2uint_rn(__half h);
+__device__ unsigned int __half2uint_ru(__half h);
+__device__ unsigned int __half2uint_rz(__half h);
+__device__ unsigned long long int __half2ull_rd(__half h);
+__device__ unsigned long long int __half2ull_rn(__half h);
+__device__ unsigned long long int __half2ull_ru(__half h);
+__device__ unsigned long long int __half2ull_rz(__half h);
+__device__ unsigned short int __half2ushort_rd(__half h);
+__device__ unsigned short int __half2ushort_rn(__half h);
+__device__ unsigned short int __half2ushort_ru(__half h);
+__device__ unsigned short int __half2ushort_rz(__half h);
+__device__ short int __half_as_short(const __half h);
+__device__ unsigned short int __half_as_ushort(const __half h);
+__device__ __half2 __halves2half2(const __half a, const __half b);
+__device__ float __high2float(const __half2 a);
+__device__ __half __high2half(const __half2 a);
+__device__ __half2 __high2half2(const __half2 a);
+__device__ __half2 __highs2half2(const __half2 a, const __half2 b);
+__device__ __half __int2half_rd(int i);
+__device__ __half __int2half_rn(int i);
+__device__ __half __int2half_ru(int i);
+__device__ __half __int2half_rz(int i);
+__device__ __half __ll2half_rd(long long int i);
+__device__ __half __ll2half_rn(long long int i);
+__device__ __half __ll2half_ru(long long int i);
+__device__ __half __ll2half_rz(long long int i);
+__device__ float __low2float(const __half2 a);

 __device__ __half __low2half(const __half2 a);
 __device__ __half2 __low2half2(const __half2 a, const __half2 b);
 __device__ __half2 __low2half2(const __half2 a);
 __device__ __half2 __lowhigh2highlow(const __half2 a);
 __device__ __half2 __lows2half2(const __half2 a, const __half2 b);
-__device__  __half __short2half_rd(short int i);
-__device__  __half __short2half_rn(short int i);
-__device__  __half __short2half_ru(short int i);
-__device__  __half __short2half_rz(short int i);
-__device__  __half __uint2half_rd(unsigned int i);
-__device__  __half __uint2half_rn(unsigned int i);
-__device__  __half __uint2half_ru(unsigned int i);
-__device__  __half __uint2half_rz(unsigned int i);
-__device__  __half __ull2half_rd(unsigned long long int i);
-__device__  __half __ull2half_rn(unsigned long long int i);
-__device__  __half __ull2half_ru(unsigned long long int i);
-__device__  __half __ull2half_rz(unsigned long long int i);
-__device__  __half __ushort2half_rd(unsigned short int i);
-__device__  __half __ushort2half_rn(unsigned short int i);
-__device__  __half __ushort2half_ru(unsigned short int i);
-__device__  __half __ushort2half_rz(unsigned short int i);
-__device__  __half __ushort_as_half(const unsigned short int i);
+__device__ __half __short2half_rd(short int i);
+__device__ __half __short2half_rn(short int i);
+__device__ __half __short2half_ru(short int i);
+__device__ __half __short2half_rz(short int i);
+__device__ __half __uint2half_rd(unsigned int i);
+__device__ __half __uint2half_rn(unsigned int i);
+__device__ __half __uint2half_ru(unsigned int i);
+__device__ __half __uint2half_rz(unsigned int i);
+__device__ __half __ull2half_rd(unsigned long long int i);
+__device__ __half __ull2half_rn(unsigned long long int i);
+__device__ __half __ull2half_ru(unsigned long long int i);
+__device__ __half __ull2half_rz(unsigned long long int i);
+__device__ __half __ushort2half_rd(unsigned short int i);
+__device__ __half __ushort2half_rn(unsigned short int i);
+__device__ __half __ushort2half_ru(unsigned short int i);
+__device__ __half __ushort2half_rz(unsigned short int i);
+__device__ __half __ushort_as_half(const unsigned short int i);

 extern "C" __half2 __hip_hc_ir_hadd2_int(__half2, __half2);
 extern "C" __half2 __hip_hc_ir_hfma2_int(__half2, __half2, __half2);
@@ -238,222 +238,202 @@ extern "C" __half2 __hip_hc_ir_h2trunc_int(__half2);
 */

 __device__ static inline __half2 __hadd2(__half2 a, __half2 b) {
-  __half2 c;
-  c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy);
-  return c;
+    __half2 c;
+    c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy);
+    return c;
 }

 __device__ static inline __half2 __hadd2_sat(__half2 a, __half2 b) {
-  __half2 c;
-  c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy);
-  return c;
+    __half2 c;
+    c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy);
+    return c;
 }

 __device__ static inline __half2 __hfma2(__half2 a, __half2 b, __half2 c) {
-  __half2 d;
-  d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy);
-  return d;
+    __half2 d;
+    d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy);
+    return d;
 }

 __device__ static inline __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c) {
-  __half2 d;
-  d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy);
-  return d;
+    __half2 d;
+    d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy);
+    return d;
 }

 __device__ static inline __half2 __hmul2(__half2 a, __half2 b) {
-  __half2 c;
-  c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy);
-  return c;
+    __half2 c;
+    c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy);
+    return c;
 }

 __device__ static inline __half2 __hmul2_sat(__half2 a, __half2 b) {
-  __half2 c;
-  c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy);
-  return c;
+    __half2 c;
+    c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy);
+    return c;
 }

 __device__ static inline __half2 __hsub2(__half2 a, __half2 b) {
-  __half2 c;
-  c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy);
-  return c;
+    __half2 c;
+    c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy);
+    return c;
 }

 __device__ static inline __half2 __hneg2(__half2 a) {
-  __half2 c;
-  c.x = - a.x;
-  c.y = - a.y;
-  return c;
+    __half2 c;
+    c.x = -a.x;
+    c.y = -a.y;
+    return c;
 }

 __device__ static inline __half2 __hsub2_sat(__half2 a, __half2 b) {
-  __half2 c;
-  c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy);
-  return c;
+    __half2 c;
+    c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy);
+    return c;
 }

 __device__ static inline __half2 h2div(__half2 a, __half2 b) {
-  __half2 c;
-  c.x = a.x / b.x;
-  c.y = a.y / b.y;
-  return c;
+    __half2 c;
+    c.x = a.x / b.x;
+    c.y = a.y / b.y;
+    return c;
 }


-__device__ static inline __half hceil(const __half h) {
-  return __hip_hc_ir_hceil_half(h);
-}
+__device__ static inline __half hceil(const __half h) { return __hip_hc_ir_hceil_half(h); }

-__device__ static inline __half hcos(const __half h) {
-  return __hip_hc_ir_hcos_half(h);
-}
+__device__ static inline __half hcos(const __half h) { return __hip_hc_ir_hcos_half(h); }

 __device__ static inline __half hexp(const __half h) {
-  return __hip_hc_ir_hexp2_half(__hmul(h, 1.442694));
+    return __hip_hc_ir_hexp2_half(__hmul(h, 1.442694));
 }

 __device__ static inline __half hexp10(const __half h) {
-  return __hip_hc_ir_hexp2_half(__hmul(h, 3.3219281));
+    return __hip_hc_ir_hexp2_half(__hmul(h, 3.3219281));
 }

-__device__ static inline __half hexp2(const __half h) {
-  return __hip_hc_ir_hexp2_half(h);
-}
+__device__ static inline __half hexp2(const __half h) { return __hip_hc_ir_hexp2_half(h); }

-__device__ static inline __half hfloor(const __half h) {
-  return __hip_hc_ir_hfloor_half(h);
-}
+__device__ static inline __half hfloor(const __half h) { return __hip_hc_ir_hfloor_half(h); }

 __device__ static inline __half hlog(const __half h) {
-  return __hmul(__hip_hc_ir_hlog2_half(h), 0.693147);
+    return __hmul(__hip_hc_ir_hlog2_half(h), 0.693147);
 }

 __device__ static inline __half hlog10(const __half h) {
-  return __hmul(__hip_hc_ir_hlog2_half(h),  0.301029);
+    return __hmul(__hip_hc_ir_hlog2_half(h), 0.301029);
 }

-__device__ static inline __half hlog2(const __half h) {
-  return __hip_hc_ir_hlog2_half(h);
-}
+__device__ static inline __half hlog2(const __half h) { return __hip_hc_ir_hlog2_half(h); }
 /*
 __device__ static inline __half hrcp(const __half h) {
  return __hip_hc_ir_hrcp_half(h);
 }
 */
-__device__ static inline __half hrint(const __half h) {
-  return __hip_hc_ir_hrint_half(h);
-}
+__device__ static inline __half hrint(const __half h) { return __hip_hc_ir_hrint_half(h); }

-__device__ static inline __half hrsqrt(const __half h) {
-  return __hip_hc_ir_hrsqrt_half(h);
-}
+__device__ static inline __half hrsqrt(const __half h) { return __hip_hc_ir_hrsqrt_half(h); }

-__device__ static inline __half hsin(const __half h) {
-  return __hip_hc_ir_hsin_half(h);
-}
+__device__ static inline __half hsin(const __half h) { return __hip_hc_ir_hsin_half(h); }

-__device__ static inline __half hsqrt(const __half a) {
-  return __hip_hc_ir_hsqrt_half(a);
-}
+__device__ static inline __half hsqrt(const __half a) { return __hip_hc_ir_hsqrt_half(a); }

-__device__ static inline __half htrunc(const __half a) {
-  return __hip_hc_ir_htrunc_half(a);
-}
+__device__ static inline __half htrunc(const __half a) { return __hip_hc_ir_htrunc_half(a); }

 /*
 Half2 Math Operations
 */

 __device__ static inline __half2 h2ceil(const __half2 h) {
-  __half2 a;
-  a.xy = __hip_hc_ir_h2ceil_int(h.xy);
-  return a;
+    __half2 a;
+    a.xy = __hip_hc_ir_h2ceil_int(h.xy);
+    return a;
 }

 __device__ static inline __half2 h2cos(const __half2 h) {
-  __half2 a;
-  a.xy = __hip_hc_ir_h2cos_int(h.xy);
-  return a;
+    __half2 a;
+    a.xy = __hip_hc_ir_h2cos_int(h.xy);
+    return a;
 }

 __device__ static inline __half2 h2exp(const __half2 h) {
-  __half2 factor;
-  factor.x = 1.442694;
-  factor.y = 1.442694;
-  factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy));
-  return factor;
+    __half2 factor;
+    factor.x = 1.442694;
+    factor.y = 1.442694;
+    factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy));
+    return factor;
 }

 __device__ static inline __half2 h2exp10(const __half2 h) {
-  __half2 factor;
-  factor.x = 3.3219281;
-  factor.y = 3.3219281;
-  factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy));
-  return factor;
+    __half2 factor;
+    factor.x = 3.3219281;
+    factor.y = 3.3219281;
+    factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy));
+    return factor;
 }

 __device__ static inline __half2 h2exp2(const __half2 h) {
-  __half2 a;
-  a.xy = __hip_hc_ir_h2exp2_int(h.xy);
-  return a;
+    __half2 a;
+    a.xy = __hip_hc_ir_h2exp2_int(h.xy);
+    return a;
 }

 __device__ static inline __half2 h2floor(const __half2 h) {
-  __half2 a;
-  a.xy = __hip_hc_ir_h2floor_int(h.xy);
-  return a;
+    __half2 a;
+    a.xy = __hip_hc_ir_h2floor_int(h.xy);
+    return a;
 }

 __device__ static inline __half2 h2log(const __half2 h) {
-  __half2 factor;
-  factor.x = 0.693147;
-  factor.y = 0.693147;
-  factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy);
-  return factor;
+    __half2 factor;
+    factor.x = 0.693147;
+    factor.y = 0.693147;
+    factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy);
+    return factor;
 }

 __device__ static inline __half2 h2log10(const __half2 h) {
-  __half2 factor;
-  factor.x = 0.301029;
-  factor.y = 0.301029;
-  factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy),  factor.xy);
-  return factor;
+    __half2 factor;
+    factor.x = 0.301029;
+    factor.y = 0.301029;
+    factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy);
+    return factor;
 }
 __device__ static inline __half2 h2log2(const __half2 h) {
-  __half2 a;
-  a.xy = __hip_hc_ir_h2log2_int(h.xy);
-  return a;
+    __half2 a;
+    a.xy = __hip_hc_ir_h2log2_int(h.xy);
+    return a;
 }

 __device__ static inline __half2 h2rcp(const __half2 h) {
-  __half2 a;
-  a.xy = __hip_hc_ir_h2rcp_int(h.xy);
-  return a;
+    __half2 a;
+    a.xy = __hip_hc_ir_h2rcp_int(h.xy);
+    return a;
 }

 __device__ static inline __half2 h2rsqrt(const __half2 h) {
-  __half2 a;
-  a.xy = __hip_hc_ir_h2rsqrt_int(h.xy);
-  return a;
+    __half2 a;
+    a.xy = __hip_hc_ir_h2rsqrt_int(h.xy);
+    return a;
 }

 __device__ static inline __half2 h2sin(const __half2 h) {
-  __half2 a;
-  a.xy = __hip_hc_ir_h2sin_int(h.xy);
-  return a;
+    __half2 a;
+    a.xy = __hip_hc_ir_h2sin_int(h.xy);
+    return a;
 }

 __device__ static inline __half2 h2sqrt(const __half2 h) {
-  __half2 a;
-  a.xy = __hip_hc_ir_h2sqrt_int(h.xy);
-  return a;
+    __half2 a;
+    a.xy = __hip_hc_ir_h2sqrt_int(h.xy);
+    return a;
 }

 __device__ static inline __half2 h2trunc(const __half2 h) {
-  __half2 a;
-  a.xy = __hip_hc_ir_h2trunc_int(h.xy);
-  return a;
+    __half2 a;
+    a.xy = __hip_hc_ir_h2trunc_int(h.xy);
+    return a;
 }
-#endif //clang_major > 3
+#endif  // clang_major > 3

 #endif
@@ -28,46 +28,46 @@ THE SOFTWARE.
 #include "hip_vector_types.h"
 #include "host_defines.h"

-__device__ char                 __ldg(const char* );
-__device__ char2                __ldg(const char2* );
-__device__ char4                __ldg(const char4* );
-__device__ signed char          __ldg(const signed char* );
-__device__ unsigned char        __ldg(const unsigned char* );
+__device__ char __ldg(const char*);
+__device__ char2 __ldg(const char2*);
+__device__ char4 __ldg(const char4*);
+__device__ signed char __ldg(const signed char*);
+__device__ unsigned char __ldg(const unsigned char*);

-__device__ short                __ldg(const short* );
-__device__ short2               __ldg(const short2* );
-__device__ short4               __ldg(const short4* );
-__device__ unsigned short       __ldg(const unsigned short* );
+__device__ short __ldg(const short*);
+__device__ short2 __ldg(const short2*);
+__device__ short4 __ldg(const short4*);
+__device__ unsigned short __ldg(const unsigned short*);

-__device__ int                  __ldg(const int* );
-__device__ int2                 __ldg(const int2* );
-__device__ int4                 __ldg(const int4* );
-__device__ unsigned int         __ldg(const unsigned int* );
+__device__ int __ldg(const int*);
+__device__ int2 __ldg(const int2*);
+__device__ int4 __ldg(const int4*);
+__device__ unsigned int __ldg(const unsigned int*);


-__device__ long                 __ldg(const long* );
-__device__ unsigned long        __ldg(const unsigned long* );
+__device__ long __ldg(const long*);
+__device__ unsigned long __ldg(const unsigned long*);

-__device__ long long            __ldg(const long long* );
-__device__ longlong2            __ldg(const longlong2* );
-__device__ unsigned long long   __ldg(const unsigned long long* );
+__device__ long long __ldg(const long long*);
+__device__ longlong2 __ldg(const longlong2*);
+__device__ unsigned long long __ldg(const unsigned long long*);

-__device__ uchar2               __ldg(const uchar2* );
-__device__ uchar4               __ldg(const uchar4* );
+__device__ uchar2 __ldg(const uchar2*);
+__device__ uchar4 __ldg(const uchar4*);

-__device__ ushort2              __ldg(const ushort2* );
+__device__ ushort2 __ldg(const ushort2*);

-__device__ uint2                __ldg(const uint2* );
-__device__ uint4                __ldg(const uint4* );
+__device__ uint2 __ldg(const uint2*);
+__device__ uint4 __ldg(const uint4*);

-__device__ ulonglong2           __ldg(const ulonglong2* );
+__device__ ulonglong2 __ldg(const ulonglong2*);

-__device__ float                __ldg(const float* );
-__device__ float2               __ldg(const float2* );
-__device__ float4               __ldg(const float4* );
+__device__ float __ldg(const float*);
+__device__ float2 __ldg(const float2*);
+__device__ float4 __ldg(const float4*);

-__device__ double               __ldg(const double* );
-__device__ double2              __ldg(const double2* );
+__device__ double __ldg(const double*);
+__device__ double2 __ldg(const double2*);

 #endif  // __hcc_workweek__

@@ -39,7 +39,7 @@ THE SOFTWARE.
 #include <math.h>
 #include <string.h>
 #include <stddef.h>
-#endif//__cplusplus
+#endif  //__cplusplus

 #if __HCC__

@@ -52,7 +52,7 @@ THE SOFTWARE.

 // define HIP_ENABLE_PRINTF to enable printf
 #ifdef HIP_ENABLE_PRINTF
-  #define HCC_ENABLE_ACCELERATOR_PRINTF 1
+#define HCC_ENABLE_ACCELERATOR_PRINTF 1
 #endif

 //---
@@ -60,29 +60,28 @@ THE SOFTWARE.
 #if defined __HCC__
 #include <grid_launch.h>
 #include "hc_printf.hpp"
-//TODO-HCC-GL - change this to typedef.
-//typedef grid_launch_parm hipLaunchParm ;
+// TODO-HCC-GL - change this to typedef.
+// typedef grid_launch_parm hipLaunchParm ;

 #if GENERIC_GRID_LAUNCH == 0
-    #define hipLaunchParm grid_launch_parm
+#define hipLaunchParm grid_launch_parm
 #else
-namespace hip_impl
-{
-    struct Empty_launch_parm {};
-}
+namespace hip_impl {
+struct Empty_launch_parm {};
+}  // namespace hip_impl
 #define hipLaunchParm hip_impl::Empty_launch_parm
-#endif //GENERIC_GRID_LAUNCH
+#endif  // GENERIC_GRID_LAUNCH

-#if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1
-#else // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
-#error (HCC must support GRID_LAUNCH_20)
-#endif //GRID_LAUNCH_VERSION
+#if defined(GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1
+#else  // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
+#error(HCC must support GRID_LAUNCH_20)
+#endif  // GRID_LAUNCH_VERSION

-#endif //HCC
+#endif  // HCC

-#if GENERIC_GRID_LAUNCH==1 && defined __HCC__
+#if GENERIC_GRID_LAUNCH == 1 && defined __HCC__
 #include "grid_launch_GGL.hpp"
-#endif//GENERIC_GRID_LAUNCH
+#endif  // GENERIC_GRID_LAUNCH

 extern int HIP_TRACE_API;

@@ -96,66 +95,67 @@ extern int HIP_TRACE_API;
 #include <hip/hcc_detail/surface_functions.h>

 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
-#if defined (__KALMAR_ACCELERATOR__) && !defined (__HCC_ACCELERATOR__)
-#define __HCC_ACCELERATOR__  __KALMAR_ACCELERATOR__
+#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
+#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
 #endif


-
-
 // TODO-HCC add a dummy implementation of assert, need to replace with a proper kernel exit call.
 #if __HIP_DEVICE_COMPILE__ == 1
-   #undef assert
-   #define assert(COND) { if (!(COND)) {abort();} }
+#undef assert
+#define assert(COND)                                                                               \
+    {                                                                                              \
+        if (!(COND)) {                                                                             \
+            abort();                                                                               \
+        }                                                                                          \
+    }
 #endif


-
 // Feature tests:
 #if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
 // Device compile and not host compile:

-    // 32-bit Atomics:
-#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__       (1)
-#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__   (1)
-#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__       (1)
-#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__   (1)
-#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__           (1)
+// 32-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)

 // 64-bit Atomics:
-#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__       (1)
-#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__       (0)
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)

 // Doubles
-#define __HIP_ARCH_HAS_DOUBLES__                    (1)
+#define __HIP_ARCH_HAS_DOUBLES__ (1)

-//warp cross-lane operations:
-#define __HIP_ARCH_HAS_WARP_VOTE__                  (1)
-#define __HIP_ARCH_HAS_WARP_BALLOT__                (1)
-#define __HIP_ARCH_HAS_WARP_SHUFFLE__               (1)
-#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__          (0)
+// warp cross-lane operations:
+#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)

-//sync
-#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__        (1)
-#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__            (0)
+// sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)

 // misc
-#define __HIP_ARCH_HAS_SURFACE_FUNCS__              (0)
-#define __HIP_ARCH_HAS_3DGRID__                     (1)
-#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__           (0)
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
+#define __HIP_ARCH_HAS_3DGRID__ (1)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)

 #endif /* Device feature flags */


-#define launch_bounds_impl0(requiredMaxThreadsPerBlock)\
+#define launch_bounds_impl0(requiredMaxThreadsPerBlock)                                            \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
-#define launch_bounds_impl1(\
-    requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)\
-    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),\
-        amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
+#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)                \
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),                     \
+                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
 #define select_impl_(_1, _2, impl_, ...) impl_
-#define __launch_bounds__(...) select_impl_(\
-    __VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
+#define __launch_bounds__(...)                                                                     \
+    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)

 // Detect if we are compiling C++ mode or C mode
 #if defined(__cplusplus)
@@ -172,114 +172,102 @@ static constexpr int warpSize = 64;
 __device__ long long int clock64();
 __device__ clock_t clock();

-//abort
+// abort
 __device__ void abort();

-//atomicAdd()
+// atomicAdd()
 __device__ int atomicAdd(int* address, int val);
-__device__ unsigned int atomicAdd(unsigned int* address,
-                       unsigned int val);
+__device__ unsigned int atomicAdd(unsigned int* address, unsigned int val);

 __device__ unsigned long long int atomicAdd(unsigned long long int* address,
-                                 unsigned long long int val);
+                                            unsigned long long int val);

 __device__ float atomicAdd(float* address, float val);


-//atomicSub()
+// atomicSub()
 __device__ int atomicSub(int* address, int val);

-__device__ unsigned int atomicSub(unsigned int* address,
-                       unsigned int val);
+__device__ unsigned int atomicSub(unsigned int* address, unsigned int val);


-//atomicExch()
+// atomicExch()
 __device__ int atomicExch(int* address, int val);

-__device__ unsigned int atomicExch(unsigned int* address,
-                        unsigned int val);
+__device__ unsigned int atomicExch(unsigned int* address, unsigned int val);

 __device__ unsigned long long int atomicExch(unsigned long long int* address,
-                                  unsigned long long int val);
+                                             unsigned long long int val);

 __device__ float atomicExch(float* address, float val);


-//atomicMin()
+// atomicMin()
 __device__ int atomicMin(int* address, int val);
-__device__ unsigned int atomicMin(unsigned int* address,
-                       unsigned int val);
+__device__ unsigned int atomicMin(unsigned int* address, unsigned int val);
 __device__ unsigned long long int atomicMin(unsigned long long int* address,
-                                 unsigned long long int val);
+                                            unsigned long long int val);


-//atomicMax()
+// atomicMax()
 __device__ int atomicMax(int* address, int val);
-__device__ unsigned int atomicMax(unsigned int* address,
-                       unsigned int val);
+__device__ unsigned int atomicMax(unsigned int* address, unsigned int val);
 __device__ unsigned long long int atomicMax(unsigned long long int* address,
-                                 unsigned long long int val);
+                                            unsigned long long int val);


-//atomicCAS()
+// atomicCAS()
 __device__ int atomicCAS(int* address, int compare, int val);
-__device__ unsigned int atomicCAS(unsigned int* address,
-                       unsigned int compare,
-                       unsigned int val);
+__device__ unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val);
 __device__ unsigned long long int atomicCAS(unsigned long long int* address,
-                                 unsigned long long int compare,
-                                 unsigned long long int val);
+                                            unsigned long long int compare,
+                                            unsigned long long int val);


-//atomicAnd()
+// atomicAnd()
 __device__ int atomicAnd(int* address, int val);
-__device__ unsigned int atomicAnd(unsigned int* address,
-                       unsigned int val);
+__device__ unsigned int atomicAnd(unsigned int* address, unsigned int val);
 __device__ unsigned long long int atomicAnd(unsigned long long int* address,
-                                 unsigned long long int val);
+                                            unsigned long long int val);


-//atomicOr()
+// atomicOr()
 __device__ int atomicOr(int* address, int val);
-__device__ unsigned int atomicOr(unsigned int* address,
-                      unsigned int val);
+__device__ unsigned int atomicOr(unsigned int* address, unsigned int val);
 __device__ unsigned long long int atomicOr(unsigned long long int* address,
-                                unsigned long long int val);
+                                           unsigned long long int val);


-//atomicXor()
+// atomicXor()
 __device__ int atomicXor(int* address, int val);
-__device__ unsigned int atomicXor(unsigned int* address,
-                       unsigned int val);
+__device__ unsigned int atomicXor(unsigned int* address, unsigned int val);
 __device__ unsigned long long int atomicXor(unsigned long long int* address,
-                                 unsigned long long int val);
+                                            unsigned long long int val);

-//atomicInc()
-__device__ unsigned int atomicInc(unsigned int* address,
-                       unsigned int val);
+// atomicInc()
+__device__ unsigned int atomicInc(unsigned int* address, unsigned int val);


-//atomicDec()
-__device__ unsigned int atomicDec(unsigned int* address,
-                       unsigned int val);
+// atomicDec()
+__device__ unsigned int atomicDec(unsigned int* address, unsigned int val);

-                       // warp vote function __all __any __ballot
-__device__ int __all(  int input);
-__device__ int __any( int input);
-__device__  unsigned long long int __ballot( int input);
+// warp vote function __all __any __ballot
+__device__ int __all(int input);
+__device__ int __any(int input);
+__device__ unsigned long long int __ballot(int input);

 #if __HIP_ARCH_GFX701__ == 0

 // warp shuffle functions
 #ifdef __cplusplus
-__device__ int __shfl(int input, int lane, int width=warpSize);
-__device__ int __shfl_up(int input, unsigned int lane_delta, int width=warpSize);
-__device__ int __shfl_down(int input, unsigned int lane_delta, int width=warpSize);
-__device__ int __shfl_xor(int input, int lane_mask, int width=warpSize);
-__device__ float __shfl(float input, int lane, int width=warpSize);
-__device__ float __shfl_up(float input, unsigned int lane_delta, int width=warpSize);
-__device__ float __shfl_down(float input, unsigned int lane_delta, int width=warpSize);
-__device__ float __shfl_xor(float input, int lane_mask, int width=warpSize);
+__device__ int __shfl(int input, int lane, int width = warpSize);
+__device__ int __shfl_up(int input, unsigned int lane_delta, int width = warpSize);
+__device__ int __shfl_down(int input, unsigned int lane_delta, int width = warpSize);
+__device__ int __shfl_xor(int input, int lane_mask, int width = warpSize);
+__device__ float __shfl(float input, int lane, int width = warpSize);
+__device__ float __shfl_up(float input, unsigned int lane_delta, int width = warpSize);
+__device__ float __shfl_down(float input, unsigned int lane_delta, int width = warpSize);
+__device__ float __shfl_xor(float input, int lane_mask, int width = warpSize);
 #else
 __device__ int __shfl(int input, int lane, int width);
 __device__ int __shfl_up(int input, unsigned int lane_delta, int width);
@@ -289,7 +277,7 @@ __device__ float __shfl(float input, int lane, int width);
 __device__ float __shfl_up(float input, unsigned int lane_delta, int width);
 __device__ float __shfl_down(float input, unsigned int lane_delta, int width);
 __device__ float __shfl_xor(float input, int lane_mask, int width);
-#endif //__cplusplus
+#endif  //__cplusplus

 __device__ unsigned __hip_ds_bpermute(int index, unsigned src);
 __device__ float __hip_ds_bpermutef(int index, float src);
@@ -301,7 +289,7 @@ __device__ float __hip_ds_swizzlef(float src, int pattern);

 __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl);

-#endif //__HIP_ARCH_GFX803__ == 1
+#endif  // __HIP_ARCH_GFX701__ == 0

 __host__ __device__ int min(int arg1, int arg2);
 __host__ __device__ int max(int arg1, int arg2);
@@ -327,17 +315,19 @@ __device__ void* __get_dynamicgroupbaseptr();
 *
 *
 *  @warning The HIP memory fence functions are currently not supported yet.
- *  If any of those threadfence stubs are reached by the application, you should set "export HSA_DISABLE_CACHE=1" to disable L1 and L2 caches.
+ *  If any of those threadfence stubs are reached by the application, you should set "export
+ *  HSA_DISABLE_CACHE=1" to disable L1 and L2 caches.
 *
 *
 *  On AMD platforms, the threadfence* routines are currently empty stubs.
 */

 extern __attribute__((const)) __device__ void __hip_hc_threadfence() __asm("__llvm_fence_sc_dev");
-extern __attribute__((const)) __device__ void __hip_hc_threadfence_block() __asm("__llvm_fence_sc_wg");
+extern __attribute__((const)) __device__ void __hip_hc_threadfence_block() __asm(
+    "__llvm_fence_sc_wg");


- /**
+/**
 * @brief threadfence_block makes writes visible to threads running in same block.
 *
 * @Returns void
@@ -347,23 +337,21 @@ extern __attribute__((const)) __device__ void __hip_hc_threadfence_block() __asm
 * @warning __threadfence_block is a stub and map to no-op.
 */
 // __device__ void  __threadfence_block(void);
-__device__ static inline void __threadfence_block(void) {
-  return __hip_hc_threadfence_block();
-}
+__device__ static inline void __threadfence_block(void) { return __hip_hc_threadfence_block(); }

- /**
-  * @brief threadfence makes wirtes visible to other threads running on same GPU.
+/**
+ * @brief threadfence makes writes visible to other threads running on same GPU.
 *
 * @Returns void
 *
 * @param void
 *
- * @warning __threadfence is a stub and map to no-op, application should set "export HSA_DISABLE_CACHE=1" to disable both L1 and L2 caches.
+ * @warning __threadfence is a stub and map to no-op, application should set "export
+ * HSA_DISABLE_CACHE=1" to disable both L1 and L2 caches.
 */
-// __device__ void  __threadfence(void) __attribute__((deprecated("Provided for compile-time compatibility, not yet functional")));
-__device__ static inline void __threadfence(void) {
-  return __hip_hc_threadfence();
-}
+// __device__ void  __threadfence(void) __attribute__((deprecated("Provided for compile-time
+// compatibility, not yet functional")));
+__device__ static inline void __threadfence(void) { return __hip_hc_threadfence(); }

 /**
 * @brief threadfence_system makes writes to pinned system memory visible on host CPU.
@@ -374,26 +362,32 @@ __device__ static inline void __threadfence(void) {
 *
 * @warning __threadfence_system is a stub and map to no-op.
 */
-//__device__ void  __threadfence_system(void) __attribute__((deprecated("Provided with workaround configuration, see hip_kernel_language.md for details")));
-__device__ void  __threadfence_system(void) ;
+//__device__ void  __threadfence_system(void) __attribute__((deprecated("Provided with workaround
+//configuration, see hip_kernel_language.md for details")));
+__device__ void __threadfence_system(void);

 // doxygen end Fence Fence
 /**
 * @}
 */

-template<typename std::common_type<
-    decltype(hc_get_group_id),
-    decltype(hc_get_group_size),
-    decltype(hc_get_num_groups),
-    decltype(hc_get_workitem_id)>::type f>
+template <
+    typename std::common_type<decltype(hc_get_group_id), decltype(hc_get_group_size),
+                              decltype(hc_get_num_groups), decltype(hc_get_workitem_id)>::type f>
 class Coordinates {
    using R = decltype(f(0));

-    struct X { __device__ operator R() const { return f(0); } };
-    struct Y { __device__ operator R() const { return f(1); } };
-    struct Z { __device__ operator R() const { return f(2); } };
-public:
+    struct X {
+        __device__ operator R() const { return f(0); }
+    };
+    struct Y {
+        __device__ operator R() const { return f(1); }
+    };
+    struct Z {
+        __device__ operator R() const { return f(2); }
+    };
+
+   public:
    static constexpr X x{};
    static constexpr Y y{};
    static constexpr Z z{};
@@ -408,42 +402,34 @@ static constexpr Coordinates<hc_get_workitem_id> threadIdx;
 #define hipThreadIdx_y (hc_get_workitem_id(1))
 #define hipThreadIdx_z (hc_get_workitem_id(2))

-#define hipBlockIdx_x  (hc_get_group_id(0))
-#define hipBlockIdx_y  (hc_get_group_id(1))
-#define hipBlockIdx_z  (hc_get_group_id(2))
+#define hipBlockIdx_x (hc_get_group_id(0))
+#define hipBlockIdx_y (hc_get_group_id(1))
+#define hipBlockIdx_z (hc_get_group_id(2))

-#define hipBlockDim_x  (hc_get_group_size(0))
-#define hipBlockDim_y  (hc_get_group_size(1))
-#define hipBlockDim_z  (hc_get_group_size(2))
+#define hipBlockDim_x (hc_get_group_size(0))
+#define hipBlockDim_y (hc_get_group_size(1))
+#define hipBlockDim_z (hc_get_group_size(2))

-#define hipGridDim_x   (hc_get_num_groups(0))
-#define hipGridDim_y   (hc_get_num_groups(1))
-#define hipGridDim_z   (hc_get_num_groups(2))
+#define hipGridDim_x (hc_get_num_groups(0))
+#define hipGridDim_y (hc_get_num_groups(1))
+#define hipGridDim_z (hc_get_num_groups(2))

 extern "C" __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size);
 extern "C" __device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size);
 extern "C" __device__ void* __hip_hc_malloc(size_t);
-extern "C" __device__ void* __hip_hc_free(void *ptr);
+extern "C" __device__ void* __hip_hc_free(void* ptr);

-static inline __device__ void* malloc(size_t size)
-{
-    return __hip_hc_malloc(size);
+static inline __device__ void* malloc(size_t size) { return __hip_hc_malloc(size); }
+
+static inline __device__ void* free(void* ptr) { return __hip_hc_free(ptr); }
+
+static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
+    return __hip_hc_memcpy(dst, src, size);
 }

-static inline __device__ void* free(void *ptr)
-{
-    return __hip_hc_free(ptr);
-}
-
-static inline __device__ void* memcpy(void* dst, const void* src, size_t size)
-{
-  return __hip_hc_memcpy(dst, src, size);
-}
-
-static inline __device__ void* memset(void* ptr, int val, size_t size)
-{
-  uint8_t val8 = static_cast <uint8_t> (val);
-  return __hip_hc_memset(ptr, val8, size);
+static inline __device__ void* memset(void* ptr, int val, size_t size) {
+    uint8_t val8 = static_cast<uint8_t>(val);
+    return __hip_hc_memset(ptr, val8, size);
 }


@@ -452,11 +438,11 @@ static inline __device__ void* memset(void* ptr, int val, size_t size)
 #ifdef HC_FEATURE_PRINTF
 template <typename... All>
 static inline __device__ void printf(const char* format, All... all) {
-  hc::printf(format, all...);
+    hc::printf(format, all...);
 }
 #else
 template <typename... All>
-static inline __device__ void printf(const char* format, All... all) { }
+static inline __device__ void printf(const char* format, All... all) {}
 #endif

 #endif
@@ -464,34 +450,40 @@ static inline __device__ void printf(const char* format, All... all) { }

 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)

-#define HIP_KERNEL_NAME(...)  (__VA_ARGS__)
+#define HIP_KERNEL_NAME(...) (__VA_ARGS__)
 #define HIP_SYMBOL(X) #X

 #if defined __HCC_CPP__
-extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr);
-extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr);
-extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr);
-extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr);
-extern void ihipPostLaunchKernel(const char *kernelName, hipStream_t stream, grid_launch_parm &lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block,
+                                       grid_launch_parm* lp, const char* kernelNameStr);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block,
+                                       grid_launch_parm* lp, const char* kernelNameStr);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block,
+                                       grid_launch_parm* lp, const char* kernelNameStr);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block,
+                                       grid_launch_parm* lp, const char* kernelNameStr);
+extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp);

 #if GENERIC_GRID_LAUNCH == 0
 //#warning "Original hipLaunchKernel defined"
-// Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can be either size_t or dim3 types
-#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
-do {\
-  grid_launch_parm lp;\
-  lp.dynamic_group_mem_bytes = _groupMemBytes; \
-  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \
-  _kernelName (lp, ##__VA_ARGS__);\
-  ihipPostLaunchKernel(#_kernelName, trueStream, lp);\
-} while(0)
-#endif //GENERIC_GRID_LAUNCH
+// Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can be
+// either size_t or dim3 types
+#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...)      \
+    do {                                                                                           \
+        grid_launch_parm lp;                                                                       \
+        lp.dynamic_group_mem_bytes = _groupMemBytes;                                               \
+        hipStream_t trueStream =                                                                   \
+            (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName));          \
+        _kernelName(lp, ##__VA_ARGS__);                                                            \
+        ihipPostLaunchKernel(#_kernelName, trueStream, lp);                                        \
+    } while (0)
+#endif  // GENERIC_GRID_LAUNCH

-#elif defined (__HCC_C__)
+#elif defined(__HCC_C__)

-//TODO - develop C interface.
+// TODO - develop C interface.

-#endif //__HCC_CPP__
+#endif  //__HCC_CPP__

 /**
 * extern __shared__
@@ -499,21 +491,18 @@ do {\

 // Macro to replace extern __shared__ declarations
 // to local variable definitions
-#define HIP_DYNAMIC_SHARED(type, var) \
-    type* var = \
-    (type*)__get_dynamicgroupbaseptr(); \
+#define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();

 #define HIP_DYNAMIC_SHARED_ATTRIBUTE


-
 /**
 * @defgroup HIP-ENV HIP Environment Variables
 * @{
 */
-//extern int HIP_PRINT_ENV ;   ///< Print all HIP-related environment variables.
-//extern int HIP_TRACE_API;    ///< Trace HIP APIs.
-//extern int HIP_LAUNCH_BLOCKING ; ///< Make all HIP APIs host-synchronous
+// extern int HIP_PRINT_ENV ;   ///< Print all HIP-related environment variables.
+// extern int HIP_TRACE_API;    ///< Trace HIP APIs.
+// extern int HIP_LAUNCH_BLOCKING ; ///< Make all HIP APIs host-synchronous

 /**
 * @}
@@ -532,17 +521,17 @@ do {\
 #define hipSetupArgument cudaSetupArgument
 #define hipLaunch cudaLaunch

-typedef int hipLaunchParm ;
+typedef int hipLaunchParm;

-#define hipLaunchKernel(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
-do {\
-  kernelName<<<numblocks,numthreads,memperblock,streamId>>>(0, ##__VA_ARGS__);\
-} while(0)
+#define hipLaunchKernel(kernelName, numblocks, numthreads, memperblock, streamId, ...)             \
+    do {                                                                                           \
+        kernelName<<<numblocks, numthreads, memperblock, streamId>>>(0, ##__VA_ARGS__);            \
+    } while (0)

-#define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
-do {\
-  kernelName<<<numblocks,numthreads,memperblock,streamId>>>(__VA_ARGS__);\
-} while(0)
+#define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...)          \
+    do {                                                                                           \
+        kernelName<<<numblocks, numthreads, memperblock, streamId>>>(__VA_ARGS__);                 \
+    } while (0)

 #include <hip/hip_runtime_api.h>

@@ -551,8 +540,8 @@ extern "C" {
 #endif /*__cplusplus*/

 hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, hipStream_t stream);
-hipError_t hipSetupArgument(const void *arg, size_t size, size_t offset);
-hipError_t hipLaunch(const void *func);
+hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset);
+hipError_t hipLaunch(const void* func);

 #if defined(__cplusplus)
 }
@@ -564,18 +553,18 @@ hipError_t hipLaunch(const void *func);
 #define hipThreadIdx_y threadIdx.y
 #define hipThreadIdx_z threadIdx.z

-#define hipBlockIdx_x  blockIdx.x
-#define hipBlockIdx_y  blockIdx.y
-#define hipBlockIdx_z  blockIdx.z
+#define hipBlockIdx_x blockIdx.x
+#define hipBlockIdx_y blockIdx.y
+#define hipBlockIdx_z blockIdx.z

-#define hipBlockDim_x  blockDim.x
-#define hipBlockDim_y  blockDim.y
-#define hipBlockDim_z  blockDim.z
+#define hipBlockDim_x blockDim.x
+#define hipBlockDim_y blockDim.y
+#define hipBlockDim_z blockDim.z

-#define hipGridDim_x  gridDim.x
-#define hipGridDim_y  gridDim.y
-#define hipGridDim_z  gridDim.z
+#define hipGridDim_x gridDim.x
+#define hipGridDim_y gridDim.y
+#define hipGridDim_z gridDim.z

 #endif

-#endif//HIP_HCC_DETAIL_RUNTIME_H
+#endif  // HIP_HCC_DETAIL_RUNTIME_H
@@ -28,7 +28,7 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_SURFACE_TYPES_H
 #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_SURFACE_TYPES_H

-#include<hip/hcc_detail/driver_types.h>
+#include <hip/hcc_detail/driver_types.h>

 /**
 * An opaque value that represents a hip surface object
@@ -38,21 +38,17 @@ typedef unsigned long long hipSurfaceObject_t;
 /**
 * hip surface reference
 */
-struct surfaceReference
-{
-    hipSurfaceObject_t          surfaceObject;
+struct surfaceReference {
+    hipSurfaceObject_t surfaceObject;
 };

 /**
 * hip surface boundary modes
 */
-enum hipSurfaceBoundaryMode
-{
-    hipBoundaryModeZero   = 0,
-    hipBoundaryModeTrap   = 1,
-    hipBoundaryModeClamp  = 2
+enum hipSurfaceBoundaryMode {
+    hipBoundaryModeZero = 0,
+    hipBoundaryModeTrap = 1,
+    hipBoundaryModeClamp = 2
 };

 #endif /* !HIP_INCLUDE_HIP_HCC_DETAIL_HIP_SURFACE_TYPES_H */
-
-
@@ -29,10 +29,10 @@ THE SOFTWARE.
 #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_TEXTURE_TYPES_H

 /*******************************************************************************
-*                                                                              *
-*                                                                              *
-*                                                                              *
-*******************************************************************************/
+ *                                                                              *
+ *                                                                              *
+ *                                                                              *
+ *******************************************************************************/
 #include <limits.h>
 //#include <hip/hcc_detail/driver_types.h>
 #include <hip/hcc_detail/channel_descriptor.h>
@@ -41,43 +41,37 @@ THE SOFTWARE.
 #if __cplusplus

 /*******************************************************************************
-*                                                                              *
-*                                                                              *
-*                                                                              *
-*******************************************************************************/
+ *                                                                              *
+ *                                                                              *
+ *                                                                              *
+ *******************************************************************************/

-template<class T, int texType = hipTextureType1D, enum hipTextureReadMode mode = hipReadModeElementType>
-struct texture : public textureReference
-{
-    texture(int                         norm  = 0,
-            enum hipTextureFilterMode  fMode = hipFilterModePoint,
-            enum hipTextureAddressMode aMode = hipAddressModeClamp)
-    {
-        normalized     = norm;
-        filterMode     = fMode;
+template <class T, int texType = hipTextureType1D,
+          enum hipTextureReadMode mode = hipReadModeElementType>
+struct texture : public textureReference {
+    texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
+            enum hipTextureAddressMode aMode = hipAddressModeClamp) {
+        normalized = norm;
+        filterMode = fMode;
        addressMode[0] = aMode;
        addressMode[1] = aMode;
        addressMode[2] = aMode;
-        channelDesc    = hipCreateChannelDesc<T>();
-        sRGB           = 0;
+        channelDesc = hipCreateChannelDesc<T>();
+        sRGB = 0;
    }

-	texture(int                         norm,
-			enum hipTextureFilterMode   fMode,
-			enum hipTextureAddressMode  aMode,
-			struct hipChannelFormatDesc desc)
-    {
-        normalized     = norm;
-        filterMode     = fMode;
+    texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode,
+            struct hipChannelFormatDesc desc) {
+        normalized = norm;
+        filterMode = fMode;
        addressMode[0] = aMode;
        addressMode[1] = aMode;
        addressMode[2] = aMode;
-        channelDesc    = desc;
-        sRGB           = 0;
+        channelDesc = desc;
+        sRGB = 0;
    }
 };

 #endif /* __cplusplus */

 #endif /* !HIP_INCLUDE_HIP_HCC_DETAIL_HIP_TEXTURE_TYPES_H */
-
@@ -38,42 +38,40 @@ THE SOFTWARE.
 /**
 * Function and kernel markers
 */
-#define __host__     __attribute__((cpu))
-#define __device__   __attribute__((hc))
+#define __host__ __attribute__((cpu))
+#define __device__ __attribute__((hc))

 #if GENERIC_GRID_LAUNCH == 0
-#define __global__  __attribute__((hc_grid_launch)) __attribute__((used))
+#define __global__ __attribute__((hc_grid_launch)) __attribute__((used))
 #else
-    #if __hcc_workweek__ >= 17481
-        #define __global__ \
-            __attribute__((annotate("__HIP_global_function__"), cpu, hc, used))
-    #else
-        #define __global__ __attribute__((hc, used))
-    #endif
-#endif //GENERIC_GRID_LAUNCH
-
-#define __noinline__      __attribute__((noinline))
-#define __forceinline__   __attribute__((always_inline))
+#if __hcc_workweek__ >= 17481
+#define __global__ __attribute__((annotate("__HIP_global_function__"), cpu, hc, used))
+#else
+#define __global__ __attribute__((hc, used))
+#endif
+#endif  // GENERIC_GRID_LAUNCH

+#define __noinline__ __attribute__((noinline))
+#define __forceinline__ __attribute__((always_inline))


 /*
 * Variable Type Qualifiers:
 */
 // _restrict is supported by the compiler
-#define __shared__     tile_static
-#define __constant__   __attribute__((hc))
+#define __shared__ tile_static
+#define __constant__ __attribute__((hc))

 #elif defined(__clang__) && defined(__HIP__)

-#define __host__      __attribute__((host))
-#define __device__    __attribute__((device))
-#define __global__    __attribute__((global))
-#define __shared__    __attribute__((shared))
-#define __constant__  __attribute__((constant))
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))

-#define __noinline__     __attribute__((noinline))
-#define __forceinline__  __attribute__((always_inline))
+#define __noinline__ __attribute__((noinline))
+#define __forceinline__ __attribute__((always_inline))

 #else

@@ -24,7 +24,7 @@ THE SOFTWARE.
 #define HIP_INCLUDE_HIP_HCC_DETAIL_MATH_FUNCTIONS_H

 #if defined(__HCC__)
-   #include <kalmar_math.h>
+#include <kalmar_math.h>
 #endif


@@ -48,7 +48,7 @@ __device__ float cospif(float x);
 //__device__ float cyl_bessel_i0f(float x);
 //__device__ float cyl_bessel_i1f(float x);
 __device__ float erfcf(float x);
-__device__  float erfcinvf(float y);
+__device__ float erfcinvf(float y);
 __device__ float erfcxf(float x);
 __device__ float erff(float x);
 __device__ float erfinvf(float y);
@@ -91,11 +91,11 @@ __device__ float norm3df(float a, float b, float c);
 __device__ float norm4df(float a, float b, float c, float d);
 __device__ float normcdff(float y);
 __device__ float normcdfinvf(float y);
-__device__ float normf(int dim, const float *a);
+__device__ float normf(int dim, const float* a);
 __device__ float powf(float x, float y);
 __device__ float rcbrtf(float x);
 __device__ float remainderf(float x, float y);
-__device__ float remquof(float x, float y, int *quo);
+__device__ float remquof(float x, float y, int* quo);
 __device__ float rhypotf(float x, float y);
 __device__ float rintf(float x);
 __device__ float rnorm3df(float a, float b, float c);
@@ -106,8 +106,8 @@ __device__ float rsqrtf(float x);
 __device__ float scalblnf(float x, long int n);
 __device__ float scalbnf(float x, int n);
 __device__ int signbit(float a);
-__device__ void sincosf(float x, float *sptr, float *cptr);
-__device__ void sincospif(float x, float *sptr, float *cptr);
+__device__ void sincosf(float x, float* sptr, float* cptr);
+__device__ void sincospif(float x, float* sptr, float* cptr);
 __device__ float sinf(float x);
 __device__ float sinhf(float x);
 __device__ float sinpif(float x);
@@ -151,7 +151,7 @@ __device__ double fma(double x, double y, double z);
 __device__ double fmax(double x, double y);
 __device__ double fmin(double x, double y);
 __device__ double fmod(double x, double y);
-__device__ double frexp(double x, int *nptr);
+__device__ double frexp(double x, int* nptr);
 __device__ double hypot(double x, double y);
 __device__ int ilogb(double x);
 __device__ int isfinite(double x);
@@ -195,8 +195,8 @@ __device__ double scalbln(double x, long int n);
 __device__ double scalbn(double x, int n);
 __device__ int signbit(double a);
 __device__ double sin(double a);
-__device__ void sincos(double x, double *sptr, double *cptr);
-__device__ void sincospi(double x, double *sptr, double *cptr);
+__device__ void sincos(double x, double* sptr, double* cptr);
+__device__ void sincospi(double x, double* sptr, double* cptr);
 __device__ double sinh(double x);
 __device__ double sinpi(double x);
 __device__ double sqrt(double x);
@@ -213,45 +213,25 @@ __device__ double yn(int n, double x);
 #ifdef HIP_FAST_MATH
 // Single Precision Precise Math when enabled

-__device__ inline float cosf(float x) {
-  return __hip_fast_cosf(x);
-}
+__device__ inline float cosf(float x) { return __hip_fast_cosf(x); }

-__device__ inline float exp10f(float x) {
-  return __hip_fast_exp10f(x);
-}
+__device__ inline float exp10f(float x) { return __hip_fast_exp10f(x); }

-__device__ inline float expf(float x) {
-  return __hip_fast_expf(x);
-}
+__device__ inline float expf(float x) { return __hip_fast_expf(x); }

-__device__ inline float log10f(float x) {
-  return __hip_fast_log10f(x);
-}
+__device__ inline float log10f(float x) { return __hip_fast_log10f(x); }

-__device__ inline float log2f(float x) {
-  return __hip_fast_log2f(x);
-}
+__device__ inline float log2f(float x) { return __hip_fast_log2f(x); }

-__device__ inline float logf(float x) {
-  return __hip_fast_logf(x);
-}
+__device__ inline float logf(float x) { return __hip_fast_logf(x); }

-__device__ inline float powf(float base, float exponent) {
-  return __hip_fast_powf(base, exponent);
-}
+__device__ inline float powf(float base, float exponent) { return __hip_fast_powf(base, exponent); }

-__device__ inline void sincosf(float x, float *s, float *c) {
-  return __hip_fast_sincosf(x, s, c);
-}
+__device__ inline void sincosf(float x, float* s, float* c) { return __hip_fast_sincosf(x, s, c); }

-__device__ inline float sinf(float x) {
-  return __hip_fast_sinf(x);
-}
+__device__ inline float sinf(float x) { return __hip_fast_sinf(x); }

-__device__ inline float tanf(float x) {
-  return __hip_fast_tanf(x);
-}
+__device__ inline float tanf(float x) { return __hip_fast_tanf(x); }

 #else

@@ -36,49 +36,33 @@ THE SOFTWARE.
 struct ihipModuleSymbol_t;
 using hipFunction_t = ihipModuleSymbol_t*;

-namespace std
-{
-    template<>
-    struct hash<hsa_agent_t> {
-        size_t operator()(hsa_agent_t x) const
-        {
-            return hash<decltype(x.handle)>{}(x.handle);
-        }
-    };
-}
+namespace std {
+template <>
+struct hash<hsa_agent_t> {
+    size_t operator()(hsa_agent_t x) const { return hash<decltype(x.handle)>{}(x.handle); }
+};
+}  // namespace std

-inline
-constexpr
-bool operator==(hsa_agent_t x, hsa_agent_t y)
-{
-    return x.handle == y.handle;
-}
+inline constexpr bool operator==(hsa_agent_t x, hsa_agent_t y) { return x.handle == y.handle; }

-namespace hip_impl
-{
-    struct Kernel_descriptor {
-        std::uint64_t kernel_object_;
-        std::uint32_t group_size_;
-        std::uint32_t private_size_;
-        std::string name_;
+namespace hip_impl {
+struct Kernel_descriptor {
+    std::uint64_t kernel_object_;
+    std::uint32_t group_size_;
+    std::uint32_t private_size_;
+    std::string name_;

-        operator hipFunction_t() const
-        {   // TODO: this is awful and only meant for illustration.
-            return reinterpret_cast<hipFunction_t>(
-                const_cast<Kernel_descriptor*>(this));
-        }
-    };
+    operator hipFunction_t() const {  // TODO: this is awful and only meant for illustration.
+        return reinterpret_cast<hipFunction_t>(const_cast<Kernel_descriptor*>(this));
+    }
+};

-    const std::unordered_map<
-        hsa_agent_t, std::vector<hsa_executable_t>>& executables();
-    const std::unordered_map<
-        std::uintptr_t,
-        std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions();
-    const std::unordered_map<std::uintptr_t, std::string>& function_names();
-    std::unordered_map<std::string, void*>& globals();
+const std::unordered_map<hsa_agent_t, std::vector<hsa_executable_t>>& executables();
+const std::unordered_map<std::uintptr_t, std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>&
+functions();
+const std::unordered_map<std::uintptr_t, std::string>& function_names();
+std::unordered_map<std::string, void*>& globals();

-    hsa_executable_t load_executable(
-        const std::string& file,
-        hsa_executable_t executable,
-        hsa_agent_t agent);
-} // Namespace hip_impl.
+hsa_executable_t load_executable(const std::string& file, hsa_executable_t executable,
+                                 hsa_agent_t agent);
+}  // Namespace hip_impl.
@@ -29,32 +29,32 @@ THE SOFTWARE.

 #define __SURFACE_FUNCTIONS_DECL__ static __inline__ __device__
 template <class T>
-__SURFACE_FUNCTIONS_DECL__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x, int y, int boundaryMode = hipBoundaryModeZero)
-{
-    hipArray* arrayPtr = (hipArray*) surfObj;
+__SURFACE_FUNCTIONS_DECL__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x, int y,
+                                           int boundaryMode = hipBoundaryModeZero) {
+    hipArray* arrayPtr = (hipArray*)surfObj;
    size_t width = arrayPtr->width;
    size_t height = arrayPtr->height;
    int32_t xOffset = x / sizeof(T);
-    T* dataPtr = (T*) arrayPtr->data;
-    if((xOffset > width) || (xOffset < 0) || (y > height) ||(y < 0)) {
-        if(boundaryMode == hipBoundaryModeZero) {
+    T* dataPtr = (T*)arrayPtr->data;
+    if ((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0)) {
+        if (boundaryMode == hipBoundaryModeZero) {
            *data = 0;
        }
    } else {
-        *data = *(dataPtr + y*width + xOffset);
+        *data = *(dataPtr + y * width + xOffset);
    }
 }

 template <class T>
-__SURFACE_FUNCTIONS_DECL__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int boundaryMode = hipBoundaryModeZero)
-{
-    hipArray* arrayPtr = (hipArray*) surfObj;
+__SURFACE_FUNCTIONS_DECL__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y,
+                                            int boundaryMode = hipBoundaryModeZero) {
+    hipArray* arrayPtr = (hipArray*)surfObj;
    size_t width = arrayPtr->width;
    size_t height = arrayPtr->height;
    int32_t xOffset = x / sizeof(T);
-    T* dataPtr = (T*) arrayPtr->data;
-    if(!((xOffset > width) || (xOffset < 0) || (y > height) ||(y < 0))){
-        *(dataPtr +y*width + xOffset) = data;
+    T* dataPtr = (T*)arrayPtr->data;
+    if (!((xOffset > width) || (xOffset < 0) || (y > height) || (y < 0))) {
+        *(dataPtr + y * width + xOffset) = data;
    }
 }

@@ -24,15 +24,15 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_TEXTURE_TYPES_H
 #define HIP_INCLUDE_HIP_HCC_DETAIL_TEXTURE_TYPES_H

-#include<hip/hcc_detail/driver_types.h>
+#include <hip/hcc_detail/driver_types.h>

-#define hipTextureType1D              0x01
-#define hipTextureType2D              0x02
-#define hipTextureType3D              0x03
-#define hipTextureTypeCubemap         0x0C
-#define hipTextureType1DLayered       0xF1
-#define hipTextureType2DLayered       0xF2
-#define hipTextureTypeCubemapLayered  0xFC
+#define hipTextureType1D 0x01
+#define hipTextureType2D 0x02
+#define hipTextureType3D 0x03
+#define hipTextureTypeCubemap 0x0C
+#define hipTextureType1DLayered 0xF1
+#define hipTextureType2DLayered 0xF2
+#define hipTextureTypeCubemapLayered 0xFC

 /**
 * Should be same as HSA_IMAGE_OBJECT_SIZE_DWORD/HSA_SAMPLER_OBJECT_SIZE_DWORD
@@ -50,10 +50,9 @@ typedef unsigned long long hipTextureObject_t;
 /**
 * hip texture address modes
 */
-enum hipTextureAddressMode
-{
-    hipAddressModeWrap   = 0,
-    hipAddressModeClamp  = 1,
+enum hipTextureAddressMode {
+    hipAddressModeWrap = 0,
+    hipAddressModeClamp = 1,
    hipAddressModeMirror = 2,
    hipAddressModeBorder = 3
 };
@@ -61,58 +60,48 @@ enum hipTextureAddressMode
 /**
 * hip texture filter modes
 */
-enum hipTextureFilterMode
-{
-    hipFilterModePoint  = 0,
-    hipFilterModeLinear = 1
-};
+enum hipTextureFilterMode { hipFilterModePoint = 0, hipFilterModeLinear = 1 };

 /**
 * hip texture read modes
 */
-enum hipTextureReadMode
-{
-    hipReadModeElementType     = 0,
-    hipReadModeNormalizedFloat = 1
-};
+enum hipTextureReadMode { hipReadModeElementType = 0, hipReadModeNormalizedFloat = 1 };

 /**
 * hip texture reference
 */
-struct textureReference
-{
-    int                         normalized;
-    enum hipTextureFilterMode   filterMode;
-    enum hipTextureAddressMode  addressMode[3]; //Texture address mode for up to 3 dimensions
+struct textureReference {
+    int normalized;
+    enum hipTextureFilterMode filterMode;
+    enum hipTextureAddressMode addressMode[3];  // Texture address mode for up to 3 dimensions
    struct hipChannelFormatDesc channelDesc;
-    int                         sRGB;           // Perform sRGB->linear conversion during texture read
-    unsigned int                maxAnisotropy;  // Limit to the anisotropy ratio
-    enum hipTextureFilterMode   mipmapFilterMode;
-    float                       mipmapLevelBias;
-    float                       minMipmapLevelClamp;
-    float                       maxMipmapLevelClamp;
+    int sRGB;                    // Perform sRGB->linear conversion during texture read
+    unsigned int maxAnisotropy;  // Limit to the anisotropy ratio
+    enum hipTextureFilterMode mipmapFilterMode;
+    float mipmapLevelBias;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;

-    hipTextureObject_t          textureObject;
-    int 						numChannels;
-    enum hipArray_Format        format;
+    hipTextureObject_t textureObject;
+    int numChannels;
+    enum hipArray_Format format;
 };

 /**
 * hip texture descriptor
 */
-struct hipTextureDesc
-{
-    enum hipTextureAddressMode  addressMode[3]; //Texture address mode for up to 3 dimensions
-    enum hipTextureFilterMode   filterMode;
-    enum hipTextureReadMode     readMode;
-    int                         sRGB;           // Perform sRGB->linear conversion during texture read
-    float                       borderColor[4];
-    int                         normalizedCoords;
-    unsigned int                maxAnisotropy;
-    enum hipTextureFilterMode   mipmapFilterMode;
-    float                       mipmapLevelBias;
-    float                       minMipmapLevelClamp;
-    float                       maxMipmapLevelClamp;
+struct hipTextureDesc {
+    enum hipTextureAddressMode addressMode[3];  // Texture address mode for up to 3 dimensions
+    enum hipTextureFilterMode filterMode;
+    enum hipTextureReadMode readMode;
+    int sRGB;  // Perform sRGB->linear conversion during texture read
+    float borderColor[4];
+    int normalizedCoords;
+    unsigned int maxAnisotropy;
+    enum hipTextureFilterMode mipmapFilterMode;
+    float mipmapLevelBias;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
 };

 #endif
@@ -28,7 +28,7 @@ THE SOFTWARE.
 // Other compiler (GCC,ICC,etc) need to set one of these macros explicitly
 #if defined(__HCC__) || (defined(__clang__) && defined(__HIP__))
 #define __HIP_PLATFORM_HCC__
-#endif //__HCC__
+#endif  //__HCC__

 // Auto enable __HIP_PLATFORM_NVCC__ if compiling with NVCC
 #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__) && !defined(__HIP__))
@@ -37,42 +37,43 @@ THE SOFTWARE.
 #define __HIPCC__
 #endif

-#endif //__NVCC__
+#endif  //__NVCC__

 // Auto enable __HIP_DEVICE_COMPILE__ if compiled in HCC or NVCC device path
-#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0)
-  #define __HIP_DEVICE_COMPILE__ 1
+#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) ||                                  \
+    (defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0)
+#define __HIP_DEVICE_COMPILE__ 1
 #endif

 #if __HIP_DEVICE_COMPILE__ == 0
 // 32-bit Atomics
-#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__       (0)
-#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__   (0)
-#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__       (0)
-#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__   (0)
-#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__           (0)
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (0)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (0)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)

 // 64-bit Atomics
-#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__       (0)
-#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__       (0)
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (0)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)

 // Doubles
-#define __HIP_ARCH_HAS_DOUBLES__                    (0)
+#define __HIP_ARCH_HAS_DOUBLES__ (0)

 // Warp cross-lane operations
-#define __HIP_ARCH_HAS_WARP_VOTE__                  (0)
-#define __HIP_ARCH_HAS_WARP_BALLOT__                (0)
-#define __HIP_ARCH_HAS_WARP_SHUFFLE__               (0)
-#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__          (0)
+#define __HIP_ARCH_HAS_WARP_VOTE__ (0)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (0)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (0)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)

 // Sync
-#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__        (0)
-#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__            (0)
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)

 // Misc
-#define __HIP_ARCH_HAS_SURFACE_FUNCS__              (0)
-#define __HIP_ARCH_HAS_3DGRID__                     (0)
-#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__           (0)
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
+#define __HIP_ARCH_HAS_3DGRID__ (0)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
 #endif

 #endif
@@ -25,9 +25,9 @@ THE SOFTWARE.

 #include <hip/hip_common.h>

-#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
 #include <hip/hcc_detail/hip_complex.h>
-#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
 #include <hip/nvcc_detail/hip_complex.h>
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
@@ -25,9 +25,9 @@ THE SOFTWARE.

 #include <hip/hip_common.h>

-#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
 #include <hip/hcc_detail/hip_fp16.h>
-#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
 #include "cuda_fp16.h"
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
@@ -29,16 +29,17 @@ THE SOFTWARE.

 // Forward declarations:
 namespace hc {
-    class accelerator;
-    class accelerator_view;
-};
+class accelerator;
+class accelerator_view;
+};  // namespace hc


 /**
 *-------------------------------------------------------------------------------------------------
 *-------------------------------------------------------------------------------------------------
 *  @defgroup HCC-specific features
- *  @warning These APIs provide access to special features of HCC compiler and are not available through the CUDA path.
+ *  @warning These APIs provide access to special features of HCC compiler and are not available
+ *  through the CUDA path.
 *  @{
 */

@@ -47,7 +48,7 @@ namespace hc {
 * @brief Return hc::accelerator associated with the specified deviceId
 * @return #hipSuccess, #hipErrorInvalidDevice
 */
-hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc);
+hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator* acc);

 /**
 * @brief Return hc::accelerator_view associated with the specified stream
@@ -55,12 +56,12 @@ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc);
 * If stream is 0, the accelerator_view for the default stream is returned.
 * @return #hipSuccess
 */
-hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av);
-
+hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view** av);


 /**
- * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
+ * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed
+ to kernelparams or extra
 *
 * @param [in] f	 Kernel to launch.
 * @param [in] gridDimX  X grid dimension specified in work-items
@@ -69,37 +70,36 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
 * @param [in] blockDimX X block dimensions specified in work-items
 * @param [in] blockDimY Y block dimension specified in work-items
 * @param [in] blockDimZ Z block dimension specified in work-items
- * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel.  The kernel can access this with HIP_DYNAMIC_SHARED.
- * @param [in] stream Stream where the kernel should be dispatched.  May be 0, in which case th default stream is used with associated synchronization rules.
- * @param [in] kernelParams 
- * @param [in] extra     Pointer to kernel arguments.   These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel.
- * @param [in] startEvent  If non-null, specified event will be updated to track the start time of the kernel launch.  The event must be created before calling this API. 
- * @param [in] stopEvent   If non-null, specified event will be updated to track the stop time of the kernel launch.  The event must be created before calling this API.
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel.  The
+ kernel can access this with HIP_DYNAMIC_SHARED.
+ * @param [in] stream Stream where the kernel should be dispatched.  May be 0, in which case the
+ default stream is used with associated synchronization rules.
+ * @param [in] kernelParams
+ * @param [in] extra     Pointer to kernel arguments.   These are passed directly to the kernel and
+ must be in the memory layout and alignment expected by the kernel.
+ * @param [in] startEvent  If non-null, specified event will be updated to track the start time of
+ the kernel launch.  The event must be created before calling this API.
+ * @param [in] stopEvent   If non-null, specified event will be updated to track the stop time of
+ the kernel launch.  The event must be created before calling this API.
 *
 * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
- * 
- * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
+ *
+ * @warning kernelParams argument is not yet implemented in HIP. Please use extra instead. Please
+ refer to hip_porting_driver_api.md for sample usage.

 * HIP/ROCm actually updates the start event when the associated kernel completes.
 */
-hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
-                                    uint32_t globalWorkSizeX,
-                                    uint32_t globalWorkSizeY,
-                                    uint32_t globalWorkSizeZ,
-                                    uint32_t localWorkSizeX,
-                                    uint32_t localWorkSizeY,
-                                    uint32_t localWorkSizeZ,
-                                    size_t sharedMemBytes,
-                                    hipStream_t hStream,
-                                    void **kernelParams,
-                                    void **extra,
-                                    hipEvent_t startEvent=nullptr,
-                                    hipEvent_t stopEvent=nullptr
-                                    );
+hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
+                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+                                    uint32_t localWorkSizeX, uint32_t localWorkSizeY,
+                                    uint32_t localWorkSizeZ, size_t sharedMemBytes,
+                                    hipStream_t hStream, void** kernelParams, void** extra,
+                                    hipEvent_t startEvent = nullptr,
+                                    hipEvent_t stopEvent = nullptr);

 // doxygen end HCC-specific features
 /**
 * @}
 */
-#endif // #ifdef __HCC__
-#endif // #ifdef HIP_INCLUDE_HIP_HIP_HCC_H
+#endif  // #ifdef __HCC__
+#endif  // #ifdef HIP_INCLUDE_HIP_HIP_HCC_H
@@ -23,13 +23,14 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_HIP_PROFILE_H
 #define HIP_INCLUDE_HIP_HIP_PROFILE_H

-#if not defined (ENABLE_HIP_PROFILE)
+#if not defined(ENABLE_HIP_PROFILE)
 #define ENABLE_HIP_PROFILE 1
 #endif

-#if defined(__HIP_PLATFORM_HCC__) and (ENABLE_HIP_PROFILE==1)
+#if defined(__HIP_PLATFORM_HCC__) and (ENABLE_HIP_PROFILE == 1)
 #include <CXLActivityLogger.h>
-#define HIP_SCOPED_MARKER(markerName, group) amdtScopedMarker __scopedMarker(markerName, group, nullptr);
+#define HIP_SCOPED_MARKER(markerName, group)                                                       \
+    amdtScopedMarker __scopedMarker(markerName, group, nullptr);
 #define HIP_BEGIN_MARKER(markerName, group) amdtBeginMarker(markerName, group, nullptr);
 #define HIP_END_MARKER() amdtEndMarker();
 #else
@@ -30,8 +30,9 @@ THE SOFTWARE.
 //! Runtime API is C
 //! Memory management is based on pure pointers and resembles malloc/free/copy.
 //
-//! hip_runtime.h     : includes everything in hip_api.h, plus math builtins and kernel launch macros.
-//! hip_runtime_api.h : Defines HIP API.  This is a C header file and does not use any C++ features.
+//! hip_runtime.h     : includes everything in hip_api.h, plus math builtins and kernel launch
+//!                     macros.
+//! hip_runtime_api.h : Defines HIP API.  This is a C header file and does not use any C++ features.

 #ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_H
 #define HIP_INCLUDE_HIP_HIP_RUNTIME_H
@@ -51,9 +52,9 @@ THE SOFTWARE.

 #include <hip/hip_common.h>

-#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
 #include <hip/hcc_detail/hip_runtime.h>
-#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
 #include <hip/nvcc_detail/hip_runtime.h>
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
@@ -31,45 +31,45 @@ THE SOFTWARE.
 #define HIP_INCLUDE_HIP_HIP_RUNTIME_API_H


-#include <string.h> // for getDeviceProp
+#include <string.h>  // for getDeviceProp
 #include <hip/hip_common.h>

 enum {
-HIP_SUCCESS = 0,
-HIP_ERROR_INVALID_VALUE,
-HIP_ERROR_NOT_INITIALIZED,
-HIP_ERROR_LAUNCH_OUT_OF_RESOURCES
+    HIP_SUCCESS = 0,
+    HIP_ERROR_INVALID_VALUE,
+    HIP_ERROR_NOT_INITIALIZED,
+    HIP_ERROR_LAUNCH_OUT_OF_RESOURCES
 };

 typedef struct {
    // 32-bit Atomics
-    unsigned hasGlobalInt32Atomics    : 1;   ///< 32-bit integer atomics for global memory.
-    unsigned hasGlobalFloatAtomicExch : 1;   ///< 32-bit float atomic exch for global memory.
-    unsigned hasSharedInt32Atomics    : 1;   ///< 32-bit integer atomics for shared memory.
-    unsigned hasSharedFloatAtomicExch : 1;   ///< 32-bit float atomic exch for shared memory.
-    unsigned hasFloatAtomicAdd        : 1;   ///< 32-bit float atomic add in global and shared memory.
+    unsigned hasGlobalInt32Atomics : 1;     ///< 32-bit integer atomics for global memory.
+    unsigned hasGlobalFloatAtomicExch : 1;  ///< 32-bit float atomic exch for global memory.
+    unsigned hasSharedInt32Atomics : 1;     ///< 32-bit integer atomics for shared memory.
+    unsigned hasSharedFloatAtomicExch : 1;  ///< 32-bit float atomic exch for shared memory.
+    unsigned hasFloatAtomicAdd : 1;  ///< 32-bit float atomic add in global and shared memory.

    // 64-bit Atomics
-    unsigned hasGlobalInt64Atomics    : 1;   ///< 64-bit integer atomics for global memory.
-    unsigned hasSharedInt64Atomics    : 1;   ///< 64-bit integer atomics for shared memory.
+    unsigned hasGlobalInt64Atomics : 1;  ///< 64-bit integer atomics for global memory.
+    unsigned hasSharedInt64Atomics : 1;  ///< 64-bit integer atomics for shared memory.

    // Doubles
-    unsigned hasDoubles               : 1;   ///< Double-precision floating point.
+    unsigned hasDoubles : 1;  ///< Double-precision floating point.

    // Warp cross-lane operations
-    unsigned hasWarpVote              : 1;   ///< Warp vote instructions (__any, __all).
-    unsigned hasWarpBallot            : 1;   ///< Warp ballot instructions (__ballot).
-    unsigned hasWarpShuffle           : 1;   ///< Warp shuffle operations. (__shfl_*).
-    unsigned hasFunnelShift           : 1;   ///< Funnel two words into one with shift&mask caps.
+    unsigned hasWarpVote : 1;     ///< Warp vote instructions (__any, __all).
+    unsigned hasWarpBallot : 1;   ///< Warp ballot instructions (__ballot).
+    unsigned hasWarpShuffle : 1;  ///< Warp shuffle operations. (__shfl_*).
+    unsigned hasFunnelShift : 1;  ///< Funnel two words into one with shift&mask caps.

    // Sync
-    unsigned hasThreadFenceSystem     : 1;   ///< __threadfence_system.
-    unsigned hasSyncThreadsExt        : 1;   ///< __syncthreads_count, syncthreads_and, syncthreads_or.
+    unsigned hasThreadFenceSystem : 1;  ///< __threadfence_system.
+    unsigned hasSyncThreadsExt : 1;     ///< __syncthreads_count, syncthreads_and, syncthreads_or.

    // Misc
-    unsigned hasSurfaceFuncs          : 1;   ///< Surface functions.
-    unsigned has3dGrid                : 1;   ///< Grid and group dims are 3D (rather than 2D).
-    unsigned hasDynamicParallelism    : 1;   ///< Dynamic parallelism.
+    unsigned hasSurfaceFuncs : 1;        ///< Surface functions.
+    unsigned has3dGrid : 1;              ///< Grid and group dims are 3D (rather than 2D).
+    unsigned hasDynamicParallelism : 1;  ///< Dynamic parallelism.
 } hipDeviceArch_t;


@@ -81,35 +81,40 @@ typedef struct {
 *
 */
 typedef struct hipDeviceProp_t {
-    char name[256];                             ///< Device name.
-    size_t totalGlobalMem;                      ///< Size of global memory region (in bytes).
-    size_t sharedMemPerBlock;                   ///< Size of shared memory region (in bytes).
-    int regsPerBlock;                           ///< Registers per block.
-    int warpSize;                               ///< Warp size.
-    int maxThreadsPerBlock;                     ///< Max work items per work group or workgroup max size.
-    int maxThreadsDim[3];                       ///< Max number of threads in each dimension (XYZ) of a block.
-    int maxGridSize[3];                         ///< Max grid dimensions (XYZ).
-    int clockRate;                              ///< Max clock frequency of the multiProcessors in khz.
-    int memoryClockRate;                        ///< Max global memory clock frequency in khz.
-    int memoryBusWidth;                         ///< Global memory bus width in bits.
-    size_t totalConstMem;                       ///< Size of shared memory region (in bytes).
-    int major;                                  ///< Major compute capability.  On HCC, this is an approximation and features may differ from CUDA CC.  See the arch feature flags for portable ways to query feature caps.
-    int minor;                                  ///< Minor compute capability.  On HCC, this is an approximation and features may differ from CUDA CC.  See the arch feature flags for portable ways to query feature caps.
-    int multiProcessorCount;                    ///< Number of multi-processors (compute units).
-    int l2CacheSize;                            ///< L2 cache size.
-    int maxThreadsPerMultiProcessor;            ///< Maximum resident threads per multi-processor.
-    int computeMode;                            ///< Compute mode.
-    int clockInstructionRate;                   ///< Frequency in khz of the timer used by the device-side "clock*" instructions.  New for HIP.
-    hipDeviceArch_t arch;                       ///< Architectural feature flags.  New for HIP.
-    int concurrentKernels;                      ///< Device can possibly execute multiple kernels concurrently.
-    int pciDomainID;                            ///< PCI Domain ID
-    int pciBusID;                               ///< PCI Bus ID.
-    int pciDeviceID;                            ///< PCI Device ID.
-    size_t maxSharedMemoryPerMultiProcessor;    ///< Maximum Shared Memory Per Multiprocessor.
-    int isMultiGpuBoard;                        ///< 1 if device is on a multi-GPU board, 0 if not.
-    int canMapHostMemory;                       ///< Check whether HIP can map host memory
-    int gcnArch;                                ///< AMD GCN Arch Value. Eg: 803, 701
- } hipDeviceProp_t;
+    char name[256];            ///< Device name.
+    size_t totalGlobalMem;     ///< Size of global memory region (in bytes).
+    size_t sharedMemPerBlock;  ///< Size of shared memory region (in bytes).
+    int regsPerBlock;          ///< Registers per block.
+    int warpSize;              ///< Warp size.
+    int maxThreadsPerBlock;    ///< Max work items per work group or workgroup max size.
+    int maxThreadsDim[3];      ///< Max number of threads in each dimension (XYZ) of a block.
+    int maxGridSize[3];        ///< Max grid dimensions (XYZ).
+    int clockRate;             ///< Max clock frequency of the multiProcessors in khz.
+    int memoryClockRate;       ///< Max global memory clock frequency in khz.
+    int memoryBusWidth;        ///< Global memory bus width in bits.
+    size_t totalConstMem;      ///< Size of constant memory region (in bytes).
+    int major;  ///< Major compute capability.  On HCC, this is an approximation and features may
+                ///< differ from CUDA CC.  See the arch feature flags for portable ways to query
+                ///< feature caps.
+    int minor;  ///< Minor compute capability.  On HCC, this is an approximation and features may
+                ///< differ from CUDA CC.  See the arch feature flags for portable ways to query
+                ///< feature caps.
+    int multiProcessorCount;          ///< Number of multi-processors (compute units).
+    int l2CacheSize;                  ///< L2 cache size.
+    int maxThreadsPerMultiProcessor;  ///< Maximum resident threads per multi-processor.
+    int computeMode;                  ///< Compute mode.
+    int clockInstructionRate;  ///< Frequency in khz of the timer used by the device-side "clock*"
+                               ///< instructions.  New for HIP.
+    hipDeviceArch_t arch;      ///< Architectural feature flags.  New for HIP.
+    int concurrentKernels;     ///< Device can possibly execute multiple kernels concurrently.
+    int pciDomainID;           ///< PCI Domain ID
+    int pciBusID;              ///< PCI Bus ID.
+    int pciDeviceID;           ///< PCI Device ID.
+    size_t maxSharedMemoryPerMultiProcessor;  ///< Maximum Shared Memory Per Multiprocessor.
+    int isMultiGpuBoard;                      ///< 1 if device is on a multi-GPU board, 0 if not.
+    int canMapHostMemory;                     ///< Check whether HIP can map host memory
+    int gcnArch;                              ///< AMD GCN Arch Value. Eg: 803, 701
+} hipDeviceProp_t;


 /**
@@ -117,21 +122,22 @@ typedef struct hipDeviceProp_t {
 */
 enum hipMemoryType {
    hipMemoryTypeHost,    ///< Memory is physically located on host
-    hipMemoryTypeDevice,  ///< Memory is physically located on device. (see deviceId for specific device)
-    hipMemoryTypeArray,   ///< Array memory, physically located on device. (see deviceId for specific device)
+    hipMemoryTypeDevice,  ///< Memory is physically located on device. (see deviceId for specific
+                          ///< device)
+    hipMemoryTypeArray,  ///< Array memory, physically located on device. (see deviceId for specific
+                         ///< device)
    hipMemoryTypeUnified  ///< Not used currently
 };


-
 /**
 * Pointer attributes
 */
 typedef struct hipPointerAttribute_t {
    enum hipMemoryType memoryType;
    int device;
-    void *devicePointer;
-    void *hostPointer;
+    void* devicePointer;
+    void* hostPointer;
    int isManaged;
    unsigned allocationFlags; /* flags specified when memory was allocated*/
    /* peers? */
@@ -158,74 +164,86 @@ typedef struct hipPointerAttribute_t {
 * @enum
 * @ingroup Enumerations
 */
-// Developer note - when updating these, update the hipErrorName and hipErrorString functions in NVCC and HCC paths
-// Also update the hipCUDAErrorTohipError function in NVCC path.
+// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
+// NVCC and HCC paths. Also update the hipCUDAErrorTohipError function in NVCC path.

 typedef enum __HIP_NODISCARD hipError_t {
-    hipSuccess                      = 0,    ///< Successful completion.
-    hipErrorOutOfMemory             = 2,
-    hipErrorNotInitialized          = 3,
-    hipErrorDeinitialized           = 4,
-    hipErrorProfilerDisabled        = 5,
-    hipErrorProfilerNotInitialized  = 6,
-    hipErrorProfilerAlreadyStarted  = 7,
-    hipErrorProfilerAlreadyStopped  = 8,
-    hipErrorInsufficientDriver      = 35,
-    hipErrorInvalidImage            = 200,
-    hipErrorInvalidContext          = 201,  ///< Produced when input context is invalid.
-    hipErrorContextAlreadyCurrent   = 202,
-    hipErrorMapFailed               = 205,
-    hipErrorUnmapFailed             = 206,
-    hipErrorArrayIsMapped           = 207,
-    hipErrorAlreadyMapped           = 208,
-    hipErrorNoBinaryForGpu          = 209,
-    hipErrorAlreadyAcquired         = 210,
-    hipErrorNotMapped               = 211,
-    hipErrorNotMappedAsArray        = 212,
-    hipErrorNotMappedAsPointer      = 213,
-    hipErrorECCNotCorrectable       = 214,
-    hipErrorUnsupportedLimit        = 215,
-    hipErrorContextAlreadyInUse     = 216,
-    hipErrorPeerAccessUnsupported   = 217,
-    hipErrorInvalidKernelFile       = 218,  ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
-    hipErrorInvalidGraphicsContext  = 219,
-    hipErrorInvalidSource           = 300,
-    hipErrorFileNotFound            = 301,
+    hipSuccess = 0,  ///< Successful completion.
+    hipErrorOutOfMemory = 2,
+    hipErrorNotInitialized = 3,
+    hipErrorDeinitialized = 4,
+    hipErrorProfilerDisabled = 5,
+    hipErrorProfilerNotInitialized = 6,
+    hipErrorProfilerAlreadyStarted = 7,
+    hipErrorProfilerAlreadyStopped = 8,
+    hipErrorInsufficientDriver = 35,
+    hipErrorInvalidImage = 200,
+    hipErrorInvalidContext = 201,  ///< Produced when input context is invalid.
+    hipErrorContextAlreadyCurrent = 202,
+    hipErrorMapFailed = 205,
+    hipErrorUnmapFailed = 206,
+    hipErrorArrayIsMapped = 207,
+    hipErrorAlreadyMapped = 208,
+    hipErrorNoBinaryForGpu = 209,
+    hipErrorAlreadyAcquired = 210,
+    hipErrorNotMapped = 211,
+    hipErrorNotMappedAsArray = 212,
+    hipErrorNotMappedAsPointer = 213,
+    hipErrorECCNotCorrectable = 214,
+    hipErrorUnsupportedLimit = 215,
+    hipErrorContextAlreadyInUse = 216,
+    hipErrorPeerAccessUnsupported = 217,
+    hipErrorInvalidKernelFile = 218,  ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
+    hipErrorInvalidGraphicsContext = 219,
+    hipErrorInvalidSource = 300,
+    hipErrorFileNotFound = 301,
    hipErrorSharedObjectSymbolNotFound = 302,
-    hipErrorSharedObjectInitFailed  = 303,
-    hipErrorOperatingSystem         = 304,
-    hipErrorSetOnActiveProcess      = 305,
-    hipErrorInvalidHandle           = 400,
-    hipErrorNotFound                = 500,
-    hipErrorIllegalAddress          = 700,
-    hipErrorInvalidSymbol           = 701,
-// Runtime Error Codes start here.
-    hipErrorMissingConfiguration    = 1001,
-    hipErrorMemoryAllocation        = 1002,    ///< Memory allocation error.
-    hipErrorInitializationError     = 1003,    ///< TODO comment from hipErrorInitializationError
-    hipErrorLaunchFailure           = 1004,    ///< An exception occurred on the device while executing a kernel.
-    hipErrorPriorLaunchFailure      = 1005,
-    hipErrorLaunchTimeOut           = 1006,
-    hipErrorLaunchOutOfResources    = 1007,    ///< Out of resources error.
-    hipErrorInvalidDeviceFunction   = 1008,
-    hipErrorInvalidConfiguration    = 1009,
-    hipErrorInvalidDevice           = 1010,   ///< DeviceID must be in range 0...#compute-devices.
-    hipErrorInvalidValue            = 1011,   ///< One or more of the parameters passed to the API call is NULL or not in an acceptable range.
-    hipErrorInvalidDevicePointer    = 1017,   ///< Invalid Device Pointer
-    hipErrorInvalidMemcpyDirection  = 1021,   ///< Invalid memory copy direction
-    hipErrorUnknown                 = 1030,   ///< Unknown error.
-    hipErrorInvalidResourceHandle   = 1033,   ///< Resource handle (hipEvent_t or hipStream_t) invalid.
-    hipErrorNotReady                = 1034,   ///< Indicates that asynchronous operations enqueued earlier are not ready.  This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion).  APIs that return this error include hipEventQuery and hipStreamQuery.
-    hipErrorNoDevice                = 1038,   ///< Call to hipGetDeviceCount returned 0 devices
-    hipErrorPeerAccessAlreadyEnabled = 1050,  ///< Peer access was already enabled from the current device.
+    hipErrorSharedObjectInitFailed = 303,
+    hipErrorOperatingSystem = 304,
+    hipErrorSetOnActiveProcess = 305,
+    hipErrorInvalidHandle = 400,
+    hipErrorNotFound = 500,
+    hipErrorIllegalAddress = 700,
+    hipErrorInvalidSymbol = 701,
+    // Runtime Error Codes start here.
+    hipErrorMissingConfiguration = 1001,
+    hipErrorMemoryAllocation = 1002,     ///< Memory allocation error.
+    hipErrorInitializationError = 1003,  ///< TODO comment from hipErrorInitializationError
+    hipErrorLaunchFailure =
+        1004,  ///< An exception occurred on the device while executing a kernel.
+    hipErrorPriorLaunchFailure = 1005,
+    hipErrorLaunchTimeOut = 1006,
+    hipErrorLaunchOutOfResources = 1007,  ///< Out of resources error.
+    hipErrorInvalidDeviceFunction = 1008,
+    hipErrorInvalidConfiguration = 1009,
+    hipErrorInvalidDevice = 1010,  ///< DeviceID must be in range 0...#compute-devices.
+    hipErrorInvalidValue = 1011,   ///< One or more of the parameters passed to the API call is NULL
+                                   ///< or not in an acceptable range.
+    hipErrorInvalidDevicePointer = 1017,    ///< Invalid Device Pointer
+    hipErrorInvalidMemcpyDirection = 1021,  ///< Invalid memory copy direction
+    hipErrorUnknown = 1030,                 ///< Unknown error.
+    hipErrorInvalidResourceHandle = 1033,  ///< Resource handle (hipEvent_t or hipStream_t) invalid.
+    hipErrorNotReady = 1034,  ///< Indicates that asynchronous operations enqueued earlier are not
+                              ///< ready.  This is not actually an error, but is used to distinguish
+                              ///< from hipSuccess (which indicates completion).  APIs that return
+                              ///< this error include hipEventQuery and hipStreamQuery.
+    hipErrorNoDevice = 1038,  ///< Call to hipGetDeviceCount returned 0 devices
+    hipErrorPeerAccessAlreadyEnabled =
+        1050,  ///< Peer access was already enabled from the current device.

-    hipErrorPeerAccessNotEnabled    = 1051,   ///< Peer access was never enabled from the current device.
-    hipErrorRuntimeMemory           = 1052,                  ///< HSA runtime memory call returned error.  Typically not seen in production systems.
-    hipErrorRuntimeOther            = 1053,                   ///< HSA runtime call other than memory returned error.  Typically not seen in production systems.
-    hipErrorHostMemoryAlreadyRegistered = 1061, ///< Produced when trying to lock a page-locked memory.
-    hipErrorHostMemoryNotRegistered = 1062,   ///< Produced when trying to unlock a non-page-locked memory.
-    hipErrorMapBufferObjectFailed = 1071,   ///< Produced when the IPC memory attach failed from ROCr.
-    hipErrorTbd                             ///< Marker that more error codes are needed.
+    hipErrorPeerAccessNotEnabled =
+        1051,                      ///< Peer access was never enabled from the current device.
+    hipErrorRuntimeMemory = 1052,  ///< HSA runtime memory call returned error.  Typically not seen
+                                   ///< in production systems.
+    hipErrorRuntimeOther = 1053,  ///< HSA runtime call other than memory returned error.  Typically
+                                  ///< not seen in production systems.
+    hipErrorHostMemoryAlreadyRegistered =
+        1061,  ///< Produced when trying to lock a page-locked memory.
+    hipErrorHostMemoryNotRegistered =
+        1062,  ///< Produced when trying to unlock a non-page-locked memory.
+    hipErrorMapBufferObjectFailed =
+        1071,    ///< Produced when the IPC memory attach failed from ROCr.
+    hipErrorTbd  ///< Marker that more error codes are needed.
 } hipError_t;

 #undef __HIP_NODISCARD
@@ -236,31 +254,39 @@ typedef enum __HIP_NODISCARD hipError_t {
 * @ingroup Enumerations
 */
 typedef enum hipDeviceAttribute_t {
-    hipDeviceAttributeMaxThreadsPerBlock,                   ///< Maximum number of threads per block.
-    hipDeviceAttributeMaxBlockDimX,                         ///< Maximum x-dimension of a block.
-    hipDeviceAttributeMaxBlockDimY,                         ///< Maximum y-dimension of a block.
-    hipDeviceAttributeMaxBlockDimZ,                         ///< Maximum z-dimension of a block.
-    hipDeviceAttributeMaxGridDimX,                          ///< Maximum x-dimension of a grid.
-    hipDeviceAttributeMaxGridDimY,                          ///< Maximum y-dimension of a grid.
-    hipDeviceAttributeMaxGridDimZ,                          ///< Maximum z-dimension of a grid.
-    hipDeviceAttributeMaxSharedMemoryPerBlock,              ///< Maximum shared memory available per block in bytes.
-    hipDeviceAttributeTotalConstantMemory,                  ///< Constant memory size in bytes.
-    hipDeviceAttributeWarpSize,                             ///< Warp size in threads.
-    hipDeviceAttributeMaxRegistersPerBlock,                 ///< Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor.
-    hipDeviceAttributeClockRate,                            ///< Peak clock frequency in kilohertz.
-    hipDeviceAttributeMemoryClockRate,                      ///< Peak memory clock frequency in kilohertz.
-    hipDeviceAttributeMemoryBusWidth,                       ///< Global memory bus width in bits.
-    hipDeviceAttributeMultiprocessorCount,                  ///< Number of multiprocessors on the device.
-    hipDeviceAttributeComputeMode,                          ///< Compute mode that device is currently in.
-    hipDeviceAttributeL2CacheSize,                          ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
-    hipDeviceAttributeMaxThreadsPerMultiProcessor,          ///< Maximum resident threads per multiprocessor.
-    hipDeviceAttributeComputeCapabilityMajor,               ///< Major compute capability version number.
-    hipDeviceAttributeComputeCapabilityMinor,               ///< Minor compute capability version number.
-    hipDeviceAttributeConcurrentKernels,                    ///< Device can possibly execute multiple kernels concurrently.
-    hipDeviceAttributePciBusId,                             ///< PCI Bus ID.
-    hipDeviceAttributePciDeviceId,                          ///< PCI Device ID.
-    hipDeviceAttributeMaxSharedMemoryPerMultiprocessor,     ///< Maximum Shared Memory Per Multiprocessor.
-    hipDeviceAttributeIsMultiGpuBoard,                      ///< Multiple GPU devices.
+    hipDeviceAttributeMaxThreadsPerBlock,       ///< Maximum number of threads per block.
+    hipDeviceAttributeMaxBlockDimX,             ///< Maximum x-dimension of a block.
+    hipDeviceAttributeMaxBlockDimY,             ///< Maximum y-dimension of a block.
+    hipDeviceAttributeMaxBlockDimZ,             ///< Maximum z-dimension of a block.
+    hipDeviceAttributeMaxGridDimX,              ///< Maximum x-dimension of a grid.
+    hipDeviceAttributeMaxGridDimY,              ///< Maximum y-dimension of a grid.
+    hipDeviceAttributeMaxGridDimZ,              ///< Maximum z-dimension of a grid.
+    hipDeviceAttributeMaxSharedMemoryPerBlock,  ///< Maximum shared memory available per block in
+                                                ///< bytes.
+    hipDeviceAttributeTotalConstantMemory,      ///< Constant memory size in bytes.
+    hipDeviceAttributeWarpSize,                 ///< Warp size in threads.
+    hipDeviceAttributeMaxRegistersPerBlock,  ///< Maximum number of 32-bit registers available to a
+                                             ///< thread block. This number is shared by all thread
+                                             ///< blocks simultaneously resident on a
+                                             ///< multiprocessor.
+    hipDeviceAttributeClockRate,             ///< Peak clock frequency in kilohertz.
+    hipDeviceAttributeMemoryClockRate,       ///< Peak memory clock frequency in kilohertz.
+    hipDeviceAttributeMemoryBusWidth,        ///< Global memory bus width in bits.
+    hipDeviceAttributeMultiprocessorCount,   ///< Number of multiprocessors on the device.
+    hipDeviceAttributeComputeMode,           ///< Compute mode that device is currently in.
+    hipDeviceAttributeL2CacheSize,  ///< Size of L2 cache in bytes. 0 if the device doesn't have L2
+                                    ///< cache.
+    hipDeviceAttributeMaxThreadsPerMultiProcessor,  ///< Maximum resident threads per
+                                                    ///< multiprocessor.
+    hipDeviceAttributeComputeCapabilityMajor,       ///< Major compute capability version number.
+    hipDeviceAttributeComputeCapabilityMinor,       ///< Minor compute capability version number.
+    hipDeviceAttributeConcurrentKernels,  ///< Device can possibly execute multiple kernels
+                                          ///< concurrently.
+    hipDeviceAttributePciBusId,           ///< PCI Bus ID.
+    hipDeviceAttributePciDeviceId,        ///< PCI Device ID.
+    hipDeviceAttributeMaxSharedMemoryPerMultiprocessor,  ///< Maximum Shared Memory Per
+                                                         ///< Multiprocessor.
+    hipDeviceAttributeIsMultiGpuBoard,                   ///< Multiple GPU devices.
 } hipDeviceAttribute_t;


@@ -268,9 +294,9 @@ typedef enum hipDeviceAttribute_t {
 *     @}
 */

-#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
 #include "hip/hcc_detail/hip_runtime_api.h"
-#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
 #include "hip/nvcc_detail/hip_runtime_api.h"
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
@@ -285,16 +311,16 @@ typedef enum hipDeviceAttribute_t {
 * @see hipMalloc
 */
 #ifdef __cplusplus
-template<class T>
-static inline hipError_t hipMalloc ( T** devPtr, size_t size)
-{
+template <class T>
+static inline hipError_t hipMalloc(T** devPtr, size_t size) {
    return hipMalloc((void**)devPtr, size);
 }

-// Provide an override to automatically typecast the pointer type from void**, and also provide a default for the flags.
-template<class T>
-static inline hipError_t hipHostMalloc( T** ptr, size_t size, unsigned int flags = hipHostMallocDefault)
-{
+// Provide an override to automatically typecast the pointer type from void**, and also provide a
+// default for the flags.
+template <class T>
+static inline hipError_t hipHostMalloc(T** ptr, size_t size,
+                                       unsigned int flags = hipHostMallocDefault) {
    return hipHostMalloc((void**)ptr, size, flags);
 }
 #endif
@@ -21,13 +21,12 @@ THE SOFTWARE.
 */


-
 #ifndef HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H
 #define HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H

-#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
 #include <hip/hcc_detail/hip_texture_types.h>
-#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
 #include <hip/nvcc_detail/hip_texture_types.h>
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
@@ -28,11 +28,11 @@ THE SOFTWARE.
 #include <hip/hip_common.h>


-#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
 #if __cplusplus
 #include <hip/hcc_detail/hip_vector_types.h>
 #endif
-#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
 #include <vector_types.h>
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
@@ -28,9 +28,9 @@ THE SOFTWARE.
 // on NVCC path:


-#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
 #include <hip/hcc_detail/math_functions.h>
-#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
 //#include <hip/nvcc_detail/math_functions.h>
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
@@ -23,6 +23,6 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_CHANNEL_DESCRIPTOR_H
 #define HIP_INCLUDE_HIP_NVCC_DETAIL_CHANNEL_DESCRIPTOR_H

-#include"channel_descriptor.h"
+#include "channel_descriptor.h"

 #endif
@@ -23,107 +23,92 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COMPLEX_H
 #define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COMPLEX_H

-#include"cuComplex.h"
+#include "cuComplex.h"

 typedef cuFloatComplex hipFloatComplex;

-__device__ __host__ static inline float hipCrealf(hipFloatComplex z){
-    return cuCrealf(z);
-}
+__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return cuCrealf(z); }

-__device__ __host__ static inline float hipCimagf(hipFloatComplex z){
-    return cuCimagf(z);
-}
+__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return cuCimagf(z); }

-__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
+__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
    return make_cuFloatComplex(a, b);
 }

-__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
-    return cuConjf(z);
-}
+__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) { return cuConjf(z); }

-__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){
+__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
    return cuCabsf(z) * cuCabsf(z);
 }

-__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
    return cuCaddf(p, q);
 }

-__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
    return cuCsubf(p, q);
 }

-__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
    return cuCmulf(p, q);
 }

-__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
    return cuCdivf(p, q);
 }

-__device__ __host__ static inline float hipCabsf(hipFloatComplex z){
-    return cuCabsf(z);
-}
+__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return cuCabsf(z); }

 typedef cuDoubleComplex hipDoubleComplex;

-__device__ __host__ static inline double hipCreal(hipDoubleComplex z){
-    return cuCreal(z);
-}
+__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return cuCreal(z); }

-__device__ __host__ static inline double hipCimag(hipDoubleComplex z){
-    return cuCimag(z);
-}
+__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return cuCimag(z); }

-__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
+__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
    return make_cuDoubleComplex(a, b);
 }

-__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
-    return cuConj(z);
-}
+__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) { return cuConj(z); }

-__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z){
+__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
    return cuCabs(z) * cuCabs(z);
 }

-__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
+__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
    return cuCadd(p, q);
 }

-__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
+__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
    return cuCsub(p, q);
 }

-__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
+__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
    return cuCdiv(p, q);
 }

-__device__ __host__ static inline double hipCabs(hipDoubleComplex z){
-    return cuCabs(z);
-}
+__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return cuCabs(z); }

 typedef cuFloatComplex hipComplex;

-__device__ __host__ static inline hipComplex make_Complex(float x, float y){
+__device__ __host__ static inline hipComplex make_Complex(float x, float y) {
    return make_cuComplex(x, y);
 }

-__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z){
+__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
    return cuComplexDoubleToFloat(z);
 }

-__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z){
+__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
    return cuComplexFloatToDouble(z);
 }

-__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
+__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
    return cuCfmaf(p, q, r);
 }

-__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
+__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+                                                           hipDoubleComplex r) {
    return cuCfma(p, q, r);
 }

@@ -29,74 +29,72 @@ THE SOFTWARE.

 #define HIP_KERNEL_NAME(...) __VA_ARGS__

-typedef int hipLaunchParm ;
+typedef int hipLaunchParm;

-#define hipLaunchKernel(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
-do {\
-kernelName<<<numblocks,numthreads,memperblock,streamId>>>(0, ##__VA_ARGS__);\
-} while(0)
+// Launches kernelName with the <<<numblocks, numthreads, memperblock, streamId>>> syntax. A literal
+// 0 is passed as the kernel's first argument (presumably the hipLaunchParm placeholder -- confirm
+// against the kernels that use it), followed by the caller's arguments.
+#define hipLaunchKernel(kernelName, numblocks, numthreads, memperblock, streamId, ...)             \
+    do {                                                                                           \
+        kernelName<<<numblocks, numthreads, memperblock, streamId>>>(0, ##__VA_ARGS__);            \
+    } while (0)

-#define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
-do {\
-kernelName<<<numblocks,numthreads,memperblock,streamId>>>(__VA_ARGS__);\
-} while(0)
+// Launches kernelName with the <<<numblocks, numthreads, memperblock, streamId>>> syntax, passing
+// exactly the caller's arguments; no leading placeholder argument is inserted.
+#define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...)          \
+    do {                                                                                           \
+        kernelName<<<numblocks, numthreads, memperblock, streamId>>>(__VA_ARGS__);                 \
+    } while (0)

 #define hipReadModeElementType cudaReadModeElementType

 #ifdef __CUDA_ARCH__


-    // 32-bit Atomics:
-#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__       (__CUDA_ARCH__ >= 110)
-#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__   (__CUDA_ARCH__ >= 110)
-#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__       (__CUDA_ARCH__ >= 120)
-#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__   (__CUDA_ARCH__ >= 120)
-#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__           (__CUDA_ARCH__ >= 200)
+// 32-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (__CUDA_ARCH__ >= 110)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 110)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (__CUDA_ARCH__ >= 200)

 // 64-bit Atomics:
-#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__       (__CUDA_ARCH__ >= 200)
-#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__       (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (__CUDA_ARCH__ >= 120)

 // Doubles
-#define __HIP_ARCH_HAS_DOUBLES__                    (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_DOUBLES__ (__CUDA_ARCH__ >= 120)

-//warp cross-lane operations:
-#define __HIP_ARCH_HAS_WARP_VOTE__                  (__CUDA_ARCH__ >= 120)
-#define __HIP_ARCH_HAS_WARP_BALLOT__                (__CUDA_ARCH__ >= 200)
-#define __HIP_ARCH_HAS_WARP_SHUFFLE__               (__CUDA_ARCH__ >= 300)
-#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__          (__CUDA_ARCH__ >= 350)
+// warp cross-lane operations:
+#define __HIP_ARCH_HAS_WARP_VOTE__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (__CUDA_ARCH__ >= 300)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (__CUDA_ARCH__ >= 350)

-//sync
-#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__        (__CUDA_ARCH__ >= 200)
-#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__            (__CUDA_ARCH__ >= 200)
+// sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (__CUDA_ARCH__ >= 200)

 // misc
-#define __HIP_ARCH_HAS_SURFACE_FUNCS__              (__CUDA_ARCH__ >= 200)
-#define __HIP_ARCH_HAS_3DGRID__                     (__CUDA_ARCH__ >= 200)
-#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__           (__CUDA_ARCH__ >= 350)
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_3DGRID__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (__CUDA_ARCH__ >= 350)

 #endif

 #ifdef __CUDACC__


-
-
 #define hipThreadIdx_x threadIdx.x
 #define hipThreadIdx_y threadIdx.y
 #define hipThreadIdx_z threadIdx.z

-#define hipBlockIdx_x  blockIdx.x
-#define hipBlockIdx_y  blockIdx.y
-#define hipBlockIdx_z  blockIdx.z
+#define hipBlockIdx_x blockIdx.x
+#define hipBlockIdx_y blockIdx.y
+#define hipBlockIdx_z blockIdx.z

-#define hipBlockDim_x  blockDim.x
-#define hipBlockDim_y  blockDim.y
-#define hipBlockDim_z  blockDim.z
+#define hipBlockDim_x blockDim.x
+#define hipBlockDim_y blockDim.y
+#define hipBlockDim_z blockDim.z

-#define hipGridDim_x  gridDim.x
-#define hipGridDim_y  gridDim.y
-#define hipGridDim_z  gridDim.z
+#define hipGridDim_x gridDim.x
+#define hipGridDim_y gridDim.y
+#define hipGridDim_z gridDim.z

 #define HIP_SYMBOL(X) X

@@ -104,15 +102,20 @@ kernelName<<<numblocks,numthreads,memperblock,streamId>>>(__VA_ARGS__);\
 * extern __shared__
 */

-#define HIP_DYNAMIC_SHARED(type, var) \
-    extern __shared__ type var[]; \
+#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];

 #define HIP_DYNAMIC_SHARED_ATTRIBUTE

 #ifdef __HIP_DEVICE_COMPILE__
-#define abort() {asm("trap;");}
+// When compiling device code (__HIP_DEVICE_COMPILE__), abort() expands to an inline-assembly
+// `trap;` instruction instead of a call to the host library function.
+#define abort()                                                                                    \
+    { asm("trap;"); }
 #undef assert
-#define assert(COND) { if (!COND) {abort();} }
+// Device-side assert: calls abort() when COND evaluates to false. COND is parenthesized so that a
+// compound expression such as `a == b` or `p || q` is negated as a whole rather than only its
+// first operand.
+#define assert(COND)                                                                               \
+    {                                                                                              \
+        if (!(COND)) {                                                                             \
+            abort();                                                                               \
+        }                                                                                          \
+    }
 #endif

 #endif
@@ -25,9 +25,9 @@ THE SOFTWARE.

 #include <hip/hip_common.h>

-#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
 #include <hip/hcc_detail/texture_types.h>
-#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
 #include "texture_types.h"
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
@@ -11,8 +11,7 @@ using namespace clara;
 using namespace hip_impl;
 using namespace std;

-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
    try {
        bool help = false;
        vector<string> inputs;
@@ -24,7 +23,8 @@ int main(int argc, char** argv)

        if (!r) throw runtime_error{r.errorMessage()};

-        if (help) cout << cmd << endl;
+        if (help)
+            cout << cmd << endl;
        else {
            if (inputs.empty()) throw runtime_error{"No inputs specified."};

@@ -33,13 +33,12 @@ int main(int argc, char** argv)
            auto tmp = tokenize_targets(targets);
            if (tmp.empty()) {
                tmp.assign(amdgpu_targets().cbegin(), amdgpu_targets().cend());
-            }
-            else validate_targets(tmp);
+            } else
+                validate_targets(tmp);

            extract_code_objects(inputs, tmp);
        }
-    }
-    catch (const exception& ex) {
+    } catch (const exception& ex) {
        cerr << ex.what() << endl;

        return EXIT_FAILURE;
@@ -13,92 +13,67 @@
 #include <string>
 #include <vector>

-namespace hip_impl
-{
-    inline
-    clara::Parser cmdline_parser(
-        bool& help,
-        std::vector<std::string>& inputs,
-        std::string& targets)
-    {
-        return
-            clara::Help{help} |
-            clara::Arg{inputs, "a" + fat_binary_extension() + " etc."}(
-                "fat binaries which contain the code objects to be extracted; "
-                "the binary format of the file(s) is documented at: "
-                "https://reviews.llvm.org/D13909; "
-                "the code object format is documented at: "
-                "https://www.llvm.org/docs/AMDGPUUsage.html#code-object.") |
-            clara::Opt{targets, "gfx803,gfx900 etc."}
-                ["-t"]["--targets"](
-                    "targets for which code objects are to be extracted from "
-                    "the fat binary; must be included in the set of processors "
-                    "with ROCm support from "
-                    "https://www.llvm.org/docs/AMDGPUUsage.html#processors.");
-    }
+namespace hip_impl {
+// Builds the command-line parser for the code-object extractor. The returned parser combines:
+//   - a help option bound to `help`;
+//   - the fat-binary file arguments, bound to `inputs`;
+//   - the -t / --targets option, bound to `targets`.
+// The parser refers to the three arguments, which are taken by non-const reference.
+inline clara::Parser cmdline_parser(bool& help, std::vector<std::string>& inputs,
+                                    std::string& targets) {
+    return clara::Help{help} |
+           clara::Arg{inputs, "a" + fat_binary_extension() + " etc."}(
+               "fat binaries which contain the code objects to be extracted; "
+               "the binary format of the file(s) is documented at: "
+               "https://reviews.llvm.org/D13909; "
+               "the code object format is documented at: "
+               "https://www.llvm.org/docs/AMDGPUUsage.html#code-object.") |
+           clara::Opt{targets, "gfx803,gfx900 etc."}["-t"]["--targets"](
+               "targets for which code objects are to be extracted from "
+               "the fat binary; must be included in the set of processors "
+               "with ROCm support from "
+               "https://www.llvm.org/docs/AMDGPUUsage.html#processors.");
+}

-    inline
-    std::string make_code_object_file_name(
-        const std::string& input, const std::string& target)
-    {
-        assert(!input.empty() && !target.empty());
+inline std::string make_code_object_file_name(const std::string& input, const std::string& target) {
+    assert(!input.empty() && !target.empty());

-        auto r = input.substr(0, input.find(fat_binary_extension()));
-        r += '_' + target + code_object_extension();
+    auto r = input.substr(0, input.find(fat_binary_extension()));
+    r += '_' + target + code_object_extension();

-        return r;
-    }
+    return r;
+}

-    inline
-    void extract_code_objects(
-        const std::vector<std::string>& inputs,
-        const std::vector<std::string>& targets)
-    {
-        for (auto&& input : inputs) {
-            std::ifstream tmp{input};
-            std::vector<char> bundle{
-                std::istreambuf_iterator<char>{tmp},
-                std::istreambuf_iterator<char>{}};
+// For every file in `inputs`, reads the whole file into memory, parses it as a bundled fat binary
+// and, for every entry of `targets`, writes the first bundle whose triple contains that target to
+// the file named by make_code_object_file_name(input, target).
+//
+// Throws std::runtime_error when an input does not parse as a valid fat binary. A target that is
+// absent from an input is not an error: a warning line is written to std::cerr and the target is
+// skipped.
+inline void extract_code_objects(const std::vector<std::string>& inputs,
+                                 const std::vector<std::string>& targets) {
+    for (auto&& input : inputs) {
+        std::ifstream tmp{input};
+        std::vector<char> bundle{std::istreambuf_iterator<char>{tmp},
+                                 std::istreambuf_iterator<char>{}};

-            Bundled_code_header tmp1{bundle};
+        Bundled_code_header tmp1{bundle};

-            if (!valid(tmp1)) {
-                throw std::runtime_error{input + " is not a valid fat binary."};
+        if (!valid(tmp1)) {
+            throw std::runtime_error{input + " is not a valid fat binary."};
+        }
+
+        for (auto&& target : targets) {
+            const auto it = std::find_if(
+                bundles(tmp1).cbegin(), bundles(tmp1).cend(),
+                [&](const Bundled_code& x) { return x.triple.find(target) != std::string::npos; });
+
+            if (it == bundles(tmp1).cend()) {
+                // Terminate the line so that consecutive warnings do not run together.
+                std::cerr << "Warning: " << input << " does not contain code for the " << target
+                          << " target." << std::endl;
+                continue;
            }

-            for (auto&& target : targets) {
-                const auto it = std::find_if(
-                    bundles(tmp1).cbegin(),
-                    bundles(tmp1).cend(),
-                    [&](const Bundled_code& x) {
-                        return x.triple.find(target) != std::string::npos;
-                });
-
-                if (it == bundles(tmp1).cend()) {
-                    std::cerr << "Warning: " << input
-                        << " does not contain code for the " << target
-                        << " target.";
-                    continue;
-                }
-
-                std::ofstream out{make_code_object_file_name(input, target)};
-                std::copy_n(
-                    it->blob.cbegin(),
-                    it->blob.size(),
-                    std::ostreambuf_iterator<char>{out});
-            }
+            std::ofstream out{make_code_object_file_name(input, target)};
+            std::copy_n(it->blob.cbegin(), it->blob.size(), std::ostreambuf_iterator<char>{out});
        }
    }
+}

-    inline
-    void validate_inputs(const std::vector<std::string>& inputs)
-    {
-        const auto it = std::find_if_not(
-            inputs.cbegin(), inputs.cend(), file_exists);
+inline void validate_inputs(const std::vector<std::string>& inputs) {
+    const auto it = std::find_if_not(inputs.cbegin(), inputs.cend(), file_exists);

-        if (it != inputs.cend()) {
-            throw std::runtime_error{
-                "Non existent file " + *it + " passed as input."};
-        }
+    if (it != inputs.cend()) {
+        throw std::runtime_error{"Non existent file " + *it + " passed as input."};
    }
-}
+}
+}  // namespace hip_impl
@@ -8,86 +8,73 @@
 #include <unordered_set>
 #include <vector>

-namespace hip_impl
-{
-    inline
-    const std::unordered_set<std::string>& amdgpu_targets()
-    {   // The evolving list lives at:
-        // https://www.llvm.org/docs/AMDGPUUsage.html#processors.
-        static const std::unordered_set<std::string> r{
-            "gfx701", "gfx801", "gfx802", "gfx803", "gfx900"};
+namespace hip_impl {
+// Returns a reference to a function-local static set of AMDGPU target names
+// ("gfx701", "gfx801", "gfx802", "gfx803", "gfx900").
+inline const std::unordered_set<std::string>& amdgpu_targets() {
+    // The evolving list lives at:
+    // https://www.llvm.org/docs/AMDGPUUsage.html#processors.
+    static const std::unordered_set<std::string> r{"gfx701", "gfx801", "gfx802", "gfx803",
+                                                   "gfx900"};

-        return r;
+    return r;
+}
+
+// Returns the file-name extension used for code object files, ".ffa". The string is created on
+// first use and the same object is returned on every call.
+inline const std::string& code_object_extension() {
+    static const std::string suffix{".ffa"};
+    return suffix;
+}
+
+// Returns the file-name extension used for fat binary files, ".adipose". The string is created on
+// first use and the same object is returned on every call.
+inline const std::string& fat_binary_extension() {
+    static const std::string suffix{".adipose"};
+    return suffix;
+}
+
+// Reports whether the file at path_to can be opened for reading.
+inline bool file_exists(const std::string& path_to) {
+    std::ifstream probe{path_to};
+    return !probe.fail();
+}
+
+// Splits the comma-separated target list x into its elements.
+//
+// Returns an empty vector when x is empty. Throws std::runtime_error when x contains any character
+// outside "gfx0123456789,". Empty elements (for example the one after a trailing comma) are kept
+// as empty strings; no further validation of the individual elements is done here.
+inline std::vector<std::string> tokenize_targets(const std::string& x) {
+    // TODO: move to regular expressions once we clarify the need to support
+    //       ancient standard library implementations.
+    if (x.empty()) return {};
+
+    static constexpr const char valid_characters[] = "gfx0123456789,";
+
+    if (x.find_first_not_of(valid_characters) != std::string::npos) {
+        throw std::runtime_error{"Invalid target string: " + x};
    }

-    inline
-    const std::string& code_object_extension()
-    {
-        static const std::string r{".ffa"};
+    std::vector<std::string> r;

-        return r;
-    }
+    auto it = x.cbegin();
+    do {
+        auto it1 = std::find(it, x.cend(), ',');
+        r.emplace_back(it, it1);

-    inline
-    const std::string& fat_binary_extension()
-    {
-        static const std::string r{".adipose"};
+        if (it1 == x.cend()) break;

-        return r;
-    }
+        it = ++it1;
+    } while (true);

-    inline
-    bool file_exists(const std::string& path_to)
-    {
-        return static_cast<bool>(std::ifstream{path_to});
-    }
+    return r;
+}

-    inline
-    std::vector<std::string> tokenize_targets(const std::string& x)
-    {   // TODO: move to regular expressions once we clarify the need to support
-        //       ancient standard library implementations.
-        if (x.empty()) return {};
+inline void validate_targets(const std::vector<std::string>& x) {
+    assert(!x.empty());

-        static constexpr const char valid_characters[] = "gfx0123456789,";
+    for (auto&& t : x) {
+        static const std::string digits{"0123456789"};
+        static const std::string pre{"gfx"};

-        if (x.find_first_not_of(valid_characters) != std::string::npos) {
-            throw std::runtime_error{"Invalid target string: " + x};
+        if (t.find(pre) != 0 || t.find_first_not_of(digits, pre.size()) != std::string::npos) {
+            throw std::runtime_error{"Invalid target: " + t};
        }

-        std::vector<std::string> r;
-
-        auto it = x.cbegin();
-        do {
-            auto it1 = std::find(it, x.cend(), ',');
-            r.emplace_back(it, it1);
-
-            if (it1 == x.cend()) break;
-
-            it = ++it1;
-        } while (true);
-
-        return r;
-    }
-
-    inline
-    void validate_targets(const std::vector<std::string>& x)
-    {
-        assert(!x.empty());
-
-        for (auto&& t : x) {
-            static const std::string digits{"0123456789"};
-            static const std::string pre{"gfx"};
-
-            if (t.find(pre) != 0 ||
-                t.find_first_not_of(digits, pre.size()) != std::string::npos) {
-                throw std::runtime_error{"Invalid target: " + t};
-            }
-
-            if (amdgpu_targets().find(t) == amdgpu_targets().cend()) {
-                std::cerr << "Warning: target " << t
-                    << " has not been validated yet; it may be invalid."
-                    << std::endl;
-            }
+        if (amdgpu_targets().find(t) == amdgpu_targets().cend()) {
+            std::cerr << "Warning: target " << t
+                      << " has not been validated yet; it may be invalid." << std::endl;
        }
    }
-} // Namespace hip_impl.
+}
+}  // Namespace hip_impl.
@@ -11,12 +11,10 @@ using namespace clara;
 using namespace hip_impl;
 using namespace std;

-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
    try {
        if (!hipcc_and_lpl_colocated()) {
-            throw runtime_error{
-                "The LPL executable and hipcc must be in the same directory."};
+            throw runtime_error{"The LPL executable and hipcc must be in the same directory."};
        }

        bool help = false;
@@ -31,22 +29,23 @@ int main(int argc, char** argv)

        if (!r) throw runtime_error{r.errorMessage()};

-        if (help) cout << cmd << endl;
+        if (help)
+            cout << cmd << endl;
        else {
            if (sources.empty()) throw runtime_error{"No inputs specified."};

            auto tmp = tokenize_targets(targets);
            if (tmp.empty()) {
                tmp.assign(amdgpu_targets().cbegin(), amdgpu_targets().cend());
-            }
-            else validate_targets(tmp);
+            } else
+                validate_targets(tmp);

-            if (output.empty()) for (auto&& x : tmp) output += x;
+            if (output.empty())
+                for (auto&& x : tmp) output += x;

            generate_fat_binary(sources, tmp, flags, output);
        }
-    }
-    catch (const exception& ex) {
+    } catch (const exception& ex) {
        cerr << ex.what() << endl;

        return EXIT_FAILURE;
@@ -18,164 +18,123 @@
 #include <utility>
 #include <vector>

-namespace hip_impl
-{
-    inline
-    const std::string& kernel_section()
-    {
-        static const std::string r{".kernel"};
+namespace hip_impl {
+inline const std::string& kernel_section() {
+    static const std::string r{".kernel"};

-        return r;
+    return r;
+}
+
+// Returns the path of the running executable, read once from the /proc/self/exe symbolic link and
+// cached for all later calls. The returned string is empty when the link cannot be read.
+inline const std::string& path_to_self() {
+    static constexpr const char self[] = "/proc/self/exe";
+
+    static std::string r;
+    static std::once_flag f;
+
+    std::call_once(f, []() {
+        using N = decltype(readlink(self, &r.front(), r.size()));
+
+        constexpr decltype(r.size()) max_path_sz{PATH_MAX};
+
+        r.resize(max_path_sz);
+        N read_cnt = readlink(self, &r.front(), r.size());
+
+        // readlink() does not fail when the buffer is too small: it truncates the result and
+        // returns the buffer size. A completely filled buffer is therefore the signal to retry
+        // with more room, whereas -1 is a genuine error that a larger buffer cannot cure.
+        while (read_cnt >= 0 && static_cast<decltype(r.size())>(read_cnt) == r.size() &&
+               r.size() <= r.max_size() / 2) {
+            r.resize(2 * r.size());
+            read_cnt = readlink(self, &r.front(), r.size());
+        }
+
+        // Keep only the bytes readlink() wrote (it does not null-terminate); on error leave the
+        // path empty so that callers can detect the failure.
+        r.resize(read_cnt < 0 ? 0 : static_cast<decltype(r.size())>(read_cnt));
+    });
+
+    return r;
+}
+
+// Returns the path at which hipcc is expected: the directory part of path_to_self() (everything
+// before its last '/') followed by "/hipcc". Computed on first use and cached.
+inline const std::string& path_to_hipcc() {
+    assert(!path_to_self().empty());
+
+    static const std::string hipcc_path = [] {
+        const std::string& self_path = path_to_self();
+        std::string candidate = self_path.substr(0, self_path.find_last_of('/'));
+        candidate += "/hipcc";
+        return candidate;
+    }();
+
+    return hipcc_path;
+}
+
+// Assembles the hipcc command line as a single string, in this order: the hipcc path, every
+// source, "-o <hipcc_output>", one "--amdgpu-target=<target>" per target, the caller's flags, and
+// finally " -fPIC -shared". Pieces are joined with single spaces and are not quoted or escaped.
+inline std::string make_hipcc_call(const std::vector<std::string>& sources,
+                                   const std::vector<std::string>& targets,
+                                   const std::string& flags, const std::string& hipcc_output) {
+    assert(!sources.empty() && !targets.empty() && !hipcc_output.empty());
+
+    std::string command = path_to_hipcc();
+    command += ' ';
+
+    for (const auto& source : sources) {
+        command += source;
+        command += ' ';
+    }
+
+    command += "-o ";
+    command += hipcc_output;
+    command += ' ';
+
+    for (const auto& target : targets) {
+        command += "--amdgpu-target=";
+        command += target;
+        command += ' ';
+    }
+
+    command += flags;
+    command += " -fPIC -shared";
+
+    return command;
+}
+
+inline void copy_kernel_section_to_fat_binary(const std::string& tmp, const std::string& output) {
+    ELFIO::elfio reader;
+    if (!reader.load(tmp)) {
+        throw std::runtime_error{"The result of the compilation is inaccessible."};
    }

-    inline
-    const std::string& path_to_self()
-    {
-        static constexpr const char self[] = "/proc/self/exe";
+    const auto it =
+        std::find_if(reader.sections.begin(), reader.sections.end(),
+                     [](const ELFIO::section* x) { return x->get_name() == kernel_section(); });

-        static std::string r;
-        static std::once_flag f;
+    std::ofstream out{output + fat_binary_extension()};

-        std::call_once(f, []() {
-            using N = decltype(readlink(self, &r.front(), r.size()));
+    if (it == reader.sections.end()) {
+        std::cerr << "Warning: no kernels were generated; fat binary shall "
+                     "be empty."
+                  << std::endl;
+    } else {
+        std::copy_n((*it)->get_data(), (*it)->get_size(), std::ostreambuf_iterator<char>{out});
+    }
+}

-            constexpr decltype(r.size()) max_path_sz{PATH_MAX};
-            N read_cnt;
-            do {
-                r.resize(std::max(2 * r.size(), max_path_sz));
-                read_cnt = readlink(self, &r.front(), r.size());
-            } while (read_cnt == -1 && r.size() < r.max_size());
+inline void generate_fat_binary(const std::vector<std::string>& sources,
+                                const std::vector<std::string>& targets, const std::string& flags,
+                                const std::string& output) {
+    static const auto d = [](const std::string* f) { remove(f->c_str()); };

-            r.resize(std::max(read_cnt, N{0}));
-        });
+    std::unique_ptr<const std::string, decltype(d)> tmp{&output, d};

-        return r;
+    redi::ipstream hipcc{make_hipcc_call(sources, targets, flags, *tmp), redi::pstream::pstderr};
+
+    if (!hipcc.is_open()) {
+        throw std::runtime_error{"Compiler invocation failed."};
    }

-    inline
-    const std::string& path_to_hipcc()
-    {
-        assert(!path_to_self().empty());
+    std::string log;
+    while (std::getline(hipcc, log)) std::cout << log << '\n';

-        static const auto r = path_to_self().substr(
-            0, path_to_self().find_last_of('/')) += "/hipcc";
+    hipcc.close();

-        return r;
+    if (hipcc.rdbuf()->exited() && hipcc.rdbuf()->status() != EXIT_SUCCESS) {
+        throw std::runtime_error{"Compilation failed."};
    }

-    inline
-    std::string make_hipcc_call(
-        const std::vector<std::string>& sources,
-        const std::vector<std::string>& targets,
-        const std::string& flags,
-        const std::string& hipcc_output)
-    {
-        assert(!sources.empty() && !targets.empty() && !hipcc_output.empty());
+    copy_kernel_section_to_fat_binary(*tmp, output);
+}

-        std::string r{path_to_hipcc() + ' '};
+inline bool hipcc_and_lpl_colocated() {
+    if (path_to_self().empty()) return false;

-        for (auto&& x : sources) r += x + ' ';
-        r += "-o " + hipcc_output + ' ';
-        for (auto&& x : targets) r += "--amdgpu-target=" + x + ' ';
-        r += flags + " -fPIC -shared";
+    return file_exists(path_to_hipcc());
+}

-        return r;
-    }
-
-    inline
-    void copy_kernel_section_to_fat_binary(
-        const std::string& tmp, const std::string& output)
-    {
-        ELFIO::elfio reader;
-        if (!reader.load(tmp)) {
-            throw std::runtime_error{
-                "The result of the compilation is inaccessible."};
-        }
-
-        const auto it = std::find_if(
-            reader.sections.begin(),
-            reader.sections.end(),
-            [](const ELFIO::section* x) {
-                return x->get_name() == kernel_section();
-        });
-
-        std::ofstream out{output + fat_binary_extension()};
-
-        if (it == reader.sections.end()) {
-            std::cerr << "Warning: no kernels were generated; fat binary shall "
-                "be empty." << std::endl;
-        }
-        else {
-            std::copy_n(
-                (*it)->get_data(),
-                (*it)->get_size(),
-                std::ostreambuf_iterator<char>{out});
-        }
-    }
-
-    inline
-    void generate_fat_binary(
-        const std::vector<std::string>& sources,
-        const std::vector<std::string>& targets,
-        const std::string& flags,
-        const std::string& output)
-    {
-        static const auto d = [](const std::string* f) { remove(f->c_str()); };
-
-        std::unique_ptr<const std::string, decltype(d)> tmp{&output, d};
-
-        redi::ipstream hipcc{
-            make_hipcc_call(sources, targets, flags, *tmp),
-            redi::pstream::pstderr};
-
-        if (!hipcc.is_open()) {
-            throw std::runtime_error{"Compiler invocation failed."};
-        }
-
-        std::string log;
-        while (std::getline(hipcc, log)) std::cout << log << '\n';
-
-        hipcc.close();
-
-        if (hipcc.rdbuf()->exited() &&
-            hipcc.rdbuf()->status() != EXIT_SUCCESS) {
-            throw std::runtime_error{"Compilation failed."};
-        }
-
-        copy_kernel_section_to_fat_binary(*tmp, output);
-    }
-
-    inline
-    bool hipcc_and_lpl_colocated()
-    {
-        if (path_to_self().empty()) return false;
-
-        return file_exists(path_to_hipcc());
-    }
-
-    inline
-    clara::Parser cmdline_parser(
-        bool& help,
-        std::vector<std::string>& sources,
-        std::string& targets,
-        std::string& flags,
-        std::string& output)
-    {
-        return
-            clara::Opt{flags, "\"-v -DMACRO etc.\""}
-                ["-f"]["--flags"](
-                    "flags for compilation; must be valid for hipcc.") |
-            clara::Help{help} |
-            clara::Opt{output, "filename"}
-                ["-o"]["--output"](
-                    "name of fat-binary output file; the binary format of the "
-                    "file is documented at: https://reviews.llvm.org/D13909.") |
-            clara::Arg{sources, "a.cpp b.cpp etc."}(
-                "inputs for compilation; must contain valid C++ code.") |
-            clara::Opt{targets, "gfx803,gfx900 etc."}
-                ["-t"]["--targets"](
-                    "targets for AMDGPU lowering; must be included in the set "
-                    "of processors with ROCm support from "
-                    "https://www.llvm.org/docs/AMDGPUUsage.html#processors.");
-    }
-}
+// Builds the command-line parser for the fat-binary generator. The returned parser combines:
+//   - the -f / --flags option, bound to `flags`;
+//   - a help option bound to `help`;
+//   - the -o / --output option, bound to `output`;
+//   - the source-file arguments, bound to `sources`;
+//   - the -t / --targets option, bound to `targets`.
+// The parser refers to the five arguments, which are taken by non-const reference.
+inline clara::Parser cmdline_parser(bool& help, std::vector<std::string>& sources,
+                                    std::string& targets, std::string& flags, std::string& output) {
+    return clara::Opt{flags, "\"-v -DMACRO etc.\""}["-f"]["--flags"](
+               "flags for compilation; must be valid for hipcc.") |
+           clara::Help{help} |
+           clara::Opt{output, "filename"}["-o"]["--output"](
+               "name of fat-binary output file; the binary format of the "
+               "file is documented at: https://reviews.llvm.org/D13909.") |
+           clara::Arg{sources,
+                      "a.cpp b.cpp etc."}("inputs for compilation; must contain valid C++ code.") |
+           clara::Opt{targets, "gfx803,gfx900 etc."}["-t"]["--targets"](
+               "targets for AMDGPU lowering; must be included in the set "
+               "of processors with ROCm support from "
+               "https://www.llvm.org/docs/AMDGPUUsage.html#processors.");
+}
+}  // namespace hip_impl
@@ -28,79 +28,76 @@ THE SOFTWARE.
 #endif


-#define CHECK(cmd) \
-{\
-    hipError_t error  = cmd;\
-    if (error != hipSuccess) { \
-      fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
-    exit(EXIT_FAILURE);\
-    }\
-}
+#define CHECK(cmd)                                                                                 \
+    {                                                                                              \
+        hipError_t error = cmd;                                                                    \
+        if (error != hipSuccess) {                                                                 \
+            fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,         \
+                    __FILE__, __LINE__);                                                           \
+            exit(EXIT_FAILURE);                                                                    \
+        }                                                                                          \
+    }

-__global__ void
-bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t N)
-{
+__global__ void bit_extract_kernel(hipLaunchParm lp, uint32_t* C_d, const uint32_t* A_d, size_t N) {
    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t stride = hipBlockDim_x * hipGridDim_x;

-    for (size_t i=offset; i<N; i+=stride) {
+    for (size_t i = offset; i < N; i += stride) {
 #ifdef __HIP_PLATFORM_HCC__
        C_d[i] = hc::__bitextract_u32(A_d[i], 8, 4);
 #else /* defined __HIP_PLATFORM_NVCC__ or other path */
-        C_d[i] = ((A_d[i] & 0xf00)  >> 8);
+        C_d[i] = ((A_d[i] & 0xf00) >> 8);
 #endif
    }
 }


-int main(int argc, char *argv[])
-{
+int main(int argc, char* argv[]) {
    uint32_t *A_d, *C_d;
    uint32_t *A_h, *C_h;
    size_t N = 1000000;
    size_t Nbytes = N * sizeof(uint32_t);

    int deviceId;
-    CHECK (hipGetDevice(&deviceId));
+    CHECK(hipGetDevice(&deviceId));
    hipDeviceProp_t props;
    CHECK(hipGetDeviceProperties(&props, deviceId));
-    printf ("info: running on device #%d %s\n", deviceId, props.name);
+    printf("info: running on device #%d %s\n", deviceId, props.name);


-    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    printf("info: allocate host mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0);
    A_h = (uint32_t*)malloc(Nbytes);
-    CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess);
    C_h = (uint32_t*)malloc(Nbytes);
-    CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess);

-    for (size_t i=0; i<N; i++)
-    {
+    for (size_t i = 0; i < N; i++) {
        A_h[i] = i;
    }

-    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    printf("info: allocate device mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0);
    CHECK(hipMalloc(&A_d, Nbytes));
    CHECK(hipMalloc(&C_d, Nbytes));

-    printf ("info: copy Host2Device\n");
-    CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+    printf("info: copy Host2Device\n");
+    CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));

-    printf ("info: launch 'bit_extract_kernel' \n");
+    printf("info: launch 'bit_extract_kernel' \n");
    const unsigned blocks = 512;
    const unsigned threadsPerBlock = 256;
-    hipLaunchKernel(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0,   C_d, A_d, N);
+    hipLaunchKernel(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);

-    printf ("info: copy Device2Host\n");
-    CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+    printf("info: copy Device2Host\n");
+    CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

-    printf ("info: check result\n");
-    for (size_t i=0; i<N; i++)  {
+    printf("info: check result\n");
+    for (size_t i = 0; i < N; i++) {
        unsigned Agold = ((A_h[i] & 0xf00) >> 8);
        if (C_h[i] != Agold) {
-            fprintf (stderr, "mismatch detected.\n");
-            printf ("%zu: %08x =? %08x (Ain=%08x)\n", i, C_h[i], Agold, A_h[i]);
+            fprintf(stderr, "mismatch detected.\n");
+            printf("%zu: %08x =? %08x (Ain=%08x)\n", i, C_h[i], Agold, A_h[i]);
            CHECK(hipErrorUnknown);
        }
    }
-    printf ("PASSED!\n");
+    printf("PASSED!\n");
 }
@@ -26,8 +26,8 @@ THE SOFTWARE.
 // will automatically copy data to and from the host, without the user needing
 // to manually perform such copies.  This is an excellent mode for developers
 // new to GPU programming and matches the memory models provided by recent systems where
-// CPU and GPU share the same memory pool.  Advanced programmers may prefer 
-// more explicit control over the data movement - shown in the other vadd_hc_array and 
+// CPU and GPU share the same memory pool.  Advanced programmers may prefer
+// more explicit control over the data movement - shown in the other vadd_hc_array and
 // vadd_hc_am examples.
 // This example shows the similarity between C++AMP and HC for simple cases where
 // implicit data transfer is used - really the only difference is the namespace.
@@ -35,8 +35,7 @@ THE SOFTWARE.

 #include <amp.h>

-int main(int argc, char *argv[])
-{
+int main(int argc, char* argv[]) {
    int sizeElements = 1000000;
    bool pass = true;

@@ -46,28 +45,30 @@ int main(int argc, char *argv[])
    concurrency::array_view<float> C(sizeElements);

    // Initialize host data
-    for (int i=0; i<sizeElements; i++) {
-        A[i] = 1.618f * i; 
+    for (int i = 0; i < sizeElements; i++) {
+        A[i] = 1.618f * i;
        B[i] = 3.142f * i;
    }
-    C.discard_data(); // tell runtime not to copy CPU host data.
+    C.discard_data();  // tell runtime not to copy CPU host data.


    // Launch kernel onto default accelerator
-    // The HCC runtime will ensure that A and B are available on the accelerator before launching the kernel.
-    concurrency::parallel_for_each(concurrency::extent<1> (sizeElements),
-      [=] (concurrency::index<1> idx) restrict(amp) { 
-        int i = idx[0];
-        C[i] = A[i] + B[i];
-    });
+    // The HCC runtime will ensure that A and B are available on the accelerator before launching
+    // the kernel.
+    concurrency::parallel_for_each(concurrency::extent<1>(sizeElements),
+                                   [=](concurrency::index<1> idx) restrict(amp) {
+                                       int i = idx[0];
+                                       C[i] = A[i] + B[i];
+                                   });

-    for (int i=0; i<sizeElements; i++) {
-        float ref= 1.618f * i + 3.142f * i;
-        // Because C is an array_view, the HCC runtime will copy C back to host at first access here:
+    for (int i = 0; i < sizeElements; i++) {
+        float ref = 1.618f * i + 3.142f * i;
+        // Because C is an array_view, the HCC runtime will copy C back to host at first access
+        // here:
        if (C[i] != ref) {
-            printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
+            printf("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
            pass = false;
        }
    };
-    if (pass) printf ("PASSED!\n");
+    if (pass) printf("PASSED!\n");
 }
@@ -24,21 +24,20 @@ THE SOFTWARE.
 // AM provides a set of c-style memory management routines for allocating,
 // freeing, and copying memory.   am_alloc returns a device pointer
 // which can only be used on the device.  The programmer has full control
-// over when data is copied.  
+// over when data is copied.

 #include <hc.hpp>
 #include <hc_am.hpp>

-int main(int argc, char *argv[])
-{
+int main(int argc, char* argv[]) {
    int sizeElements = 1000000;
    size_t sizeBytes = sizeElements * sizeof(float);
    bool pass = true;

    // Allocate host memory
-    float *A_h = (float*)malloc(sizeBytes);
-    float *B_h = (float*)malloc(sizeBytes);
-    float *C_h = (float*)malloc(sizeBytes);
+    float* A_h = (float*)malloc(sizeBytes);
+    float* B_h = (float*)malloc(sizeBytes);
+    float* C_h = (float*)malloc(sizeBytes);

    // Allocate device pointers:
    // Unlike array_view, these must be explicitly managed by user:
@@ -51,36 +50,37 @@ int main(int argc, char *argv[])
    C_d = hc::am_alloc(sizeBytes, acc, 0);

    // Initialize host data
-    for (int i=0; i<sizeElements; i++) {
-        A_h[i] = 1.618f * i; 
+    for (int i = 0; i < sizeElements; i++) {
+        A_h[i] = 1.618f * i;
        B_h[i] = 3.142f * i;
        C_h[i] = 0;
    }

-    av.copy(A_h, A_d, sizeBytes); // C++ copy H2D
-    av.copy(B_h, B_d, sizeBytes); // C++ copy H2D
+    av.copy(A_h, A_d, sizeBytes);  // C++ copy H2D
+    av.copy(B_h, B_d, sizeBytes);  // C++ copy H2D

-    // Launch kernel onto AV.  
+    // Launch kernel onto AV.
    // Because the kernel PFE and the copies are submitted to same AV, they will execute in order
-    // and we don't need additional synchronization to ensure the copies complete before the PFE begins.
-    hc::completion_future cf=
-    hc::parallel_for_each(av,  hc::extent<1> (sizeElements),
-      [=] (hc::index<1> idx) [[hc]] { 
-        int i = idx[0];
-        C_d[i] = A_d[i] + B_d[i];
-    });
-
-   
-    // This copy is in same AV as the kernel and thus will wait for the kernel to finish before executing.
-    av.copy(C_d, C_h, sizeBytes); // C++ copy D2H
+    // and we don't need additional synchronization to ensure the copies complete before the PFE
+    // begins.
+    hc::completion_future cf =
+        hc::parallel_for_each(av, hc::extent<1>(sizeElements), [=](hc::index<1> idx)[[hc]] {
+            int i = idx[0];
+            C_d[i] = A_d[i] + B_d[i];
+        });


-    for (int i=0; i<sizeElements; i++) {
-        float ref= 1.618f * i + 3.142f * i;
+    // This copy is in same AV as the kernel and thus will wait for the kernel to finish before
+    // executing.
+    av.copy(C_d, C_h, sizeBytes);  // C++ copy D2H
+
+
+    for (int i = 0; i < sizeElements; i++) {
+        float ref = 1.618f * i + 3.142f * i;
        if (C_h[i] != ref) {
-            printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
+            printf("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
            pass = false;
        }
    };
-    if (pass) printf ("PASSED!\n");
+    if (pass) printf("PASSED!\n");
 }
@@ -21,7 +21,7 @@ THE SOFTWARE.
 */

 // Simple test showing how to use HC syntax with array.
-// Array provides a type-safe C++ mechanism to allocate accelerator memory.  
+// Array provides a type-safe C++ mechanism to allocate accelerator memory.
 // Like array_view, hc::array provides multi-dimensional indexing capability,
 // and is typed.  However, unlike array_view, hc::array does not provide
 // automatic data management capabilities - instead the programmer
@@ -29,16 +29,15 @@ THE SOFTWARE.

 #include <hc.hpp>

-int main(int argc, char *argv[])
-{
+int main(int argc, char* argv[]) {
    int sizeElements = 1000000;
    size_t sizeBytes = sizeElements * sizeof(float);
    bool pass = true;

    // Allocate host memory
-    float *A_h = (float*)malloc(sizeBytes);
-    float *B_h = (float*)malloc(sizeBytes);
-    float *C_h = (float*)malloc(sizeBytes);
+    float* A_h = (float*)malloc(sizeBytes);
+    float* B_h = (float*)malloc(sizeBytes);
+    float* C_h = (float*)malloc(sizeBytes);

    // Allocate device arrays<>
    // Unlike array_view, these must be explicitly managed by user:
@@ -47,32 +46,32 @@ int main(int argc, char *argv[])
    hc::array<float> C_d(sizeElements);

    // Initialize host data
-    for (int i=0; i<sizeElements; i++) {
-        A_h[i] = 1.618f * i; 
+    for (int i = 0; i < sizeElements; i++) {
+        A_h[i] = 1.618f * i;
        B_h[i] = 3.142f * i;
    }

-    hc::copy(A_h, A_d); // C++ copy H2D
-    hc::copy(B_h, B_d); // C++ copy H2D
+    hc::copy(A_h, A_d);  // C++ copy H2D
+    hc::copy(B_h, B_d);  // C++ copy H2D

    // Launch kernel onto default accelerator:
    // array<> types are not implicitly copied, so we performed copies above.
-    hc::parallel_for_each(hc::extent<1> (sizeElements),
-      [&] (hc::index<1> idx) [[hc]] { 
+    hc::parallel_for_each(hc::extent<1>(sizeElements), [&](hc::index<1> idx)[[hc]] {
        int i = idx[0];
        C_d[i] = A_d[i] + B_d[i];
    });

-    // HCC runtime knows that C_d depends on previous PFE and will force the copy to wait for the PFE to complte.
-    hc::copy(C_d, C_h); // C++ copy D2H
+    // HCC runtime knows that C_d depends on previous PFE and will force the copy to wait for the
+    // PFE to complete.
+    hc::copy(C_d, C_h);  // C++ copy D2H


-    for (int i=0; i<sizeElements; i++) {
-        float ref= 1.618f * i + 3.142f * i;
+    for (int i = 0; i < sizeElements; i++) {
+        float ref = 1.618f * i + 3.142f * i;
        if (C_h[i] != ref) {
-            printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
+            printf("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
            pass = false;
        }
    };
-    if (pass) printf ("PASSED!\n");
+    if (pass) printf("PASSED!\n");
 }
@@ -26,8 +26,8 @@ THE SOFTWARE.
 // will automatically copy data to and from the host, without the user needing
 // to manually perform such copies.  This is an excellent mode for developers
 // new to GPU programming and matches the memory models provided by recent systems where
-// CPU and GPU share the same memory pool.  Advanced programmers may prefer 
-// more explicit control over the data movement - shown in the other vadd_hc_array and 
+// CPU and GPU share the same memory pool.  Advanced programmers may prefer
+// more explicit control over the data movement - shown in the other vadd_hc_array and
 // vadd_hc_am examples.
 // This example shows the similarity between C++AMP and HC for simple cases where
 // implicit data transfer is used - really the only difference is the namespace.
@@ -35,8 +35,7 @@ THE SOFTWARE.

 #include <hc.hpp>

-int main(int argc, char *argv[])
-{
+int main(int argc, char* argv[]) {
    int sizeElements = 1000000;
    bool pass = true;

@@ -46,28 +45,29 @@ int main(int argc, char *argv[])
    hc::array_view<float> C(sizeElements);

    // Initialize host data
-    for (int i=0; i<sizeElements; i++) {
-        A[i] = 1.618f * i; 
+    for (int i = 0; i < sizeElements; i++) {
+        A[i] = 1.618f * i;
        B[i] = 3.142f * i;
    }
-    C.discard_data(); // tell runtime not to copy CPU host data.
+    C.discard_data();  // tell runtime not to copy CPU host data.


    // Launch kernel onto default accelerator:
-    // The HCC runtime will ensure that A and B are available on the accelerator before launching the kernel.
-    hc::parallel_for_each(hc::extent<1> (sizeElements),
-      [=] (hc::index<1> idx) [[hc]] { 
+    // The HCC runtime will ensure that A and B are available on the accelerator before launching
+    // the kernel.
+    hc::parallel_for_each(hc::extent<1>(sizeElements), [=](hc::index<1> idx)[[hc]] {
        int i = idx[0];
        C[i] = A[i] + B[i];
    });

-    for (int i=0; i<sizeElements; i++) {
-        float ref= 1.618f * i + 3.142f * i;
-        // Because C is an array_view, the HCC runtime will copy C back to host at first access here:
+    for (int i = 0; i < sizeElements; i++) {
+        float ref = 1.618f * i + 3.142f * i;
+        // Because C is an array_view, the HCC runtime will copy C back to host at first access
+        // here:
        if (C[i] != ref) {
-            printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
+            printf("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
            pass = false;
        }
    };
-    if (pass) printf ("PASSED!\n");
+    if (pass) printf("PASSED!\n");
 }
@@ -22,8 +22,7 @@ THE SOFTWARE.

 #include "hip/hip_runtime.h"

-__global__ void vadd_hip(hipLaunchParm lp, const float *a, const float *b, float *c, int N)
-{
+__global__ void vadd_hip(hipLaunchParm lp, const float* a, const float* b, float* c, int N) {
    int idx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);

    if (idx < N) {
@@ -32,16 +31,15 @@ __global__ void vadd_hip(hipLaunchParm lp, const float *a, const float *b, float
 }


-int main(int argc, char *argv[])
-{
+int main(int argc, char* argv[]) {
    int sizeElements = 1000000;
    size_t sizeBytes = sizeElements * sizeof(float);
    bool pass = true;

    // Allocate host memory
-    float *A_h = (float*)malloc(sizeBytes);
-    float *B_h = (float*)malloc(sizeBytes);
-    float *C_h = (float*)malloc(sizeBytes);
+    float* A_h = (float*)malloc(sizeBytes);
+    float* B_h = (float*)malloc(sizeBytes);
+    float* C_h = (float*)malloc(sizeBytes);

    // Allocate device memory:
    float *A_d, *B_d, *C_d;
@@ -50,8 +48,8 @@ int main(int argc, char *argv[])
    hipMalloc(&C_d, sizeBytes);

    // Initialize host memory
-    for (int i=0; i<sizeElements; i++) {
-        A_h[i] = 1.618f * i; 
+    for (int i = 0; i < sizeElements; i++) {
+        A_h[i] = 1.618f * i;
        B_h[i] = 3.142f * i;
    }

@@ -60,20 +58,20 @@ int main(int argc, char *argv[])
    hipMemcpy(B_d, B_h, sizeBytes, hipMemcpyHostToDevice);

    // Launch kernel onto default accelerator
-    int blockSize = 256;  // pick arbitrary block size
-    int blocks = (sizeElements+blockSize-1)/blockSize; // round up to launch enough blocks
+    int blockSize = 256;                                      // pick arbitrary block size
+    int blocks = (sizeElements + blockSize - 1) / blockSize;  // round up to launch enough blocks
    hipLaunchKernel(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);

    // D2H Copy
    hipMemcpy(C_h, C_d, sizeBytes, hipMemcpyDeviceToHost);

    // Verify
-    for (int i=0; i<sizeElements; i++) {
-        float ref= 1.618f * i + 3.142f * i;
+    for (int i = 0; i < sizeElements; i++) {
+        float ref = 1.618f * i + 3.142f * i;
        if (C_h[i] != ref) {
-            printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
+            printf("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
            pass = false;
        }
    };
-    if (pass) printf ("PASSED!\n");
+    if (pass) printf("PASSED!\n");
 }
@@ -22,25 +22,25 @@ THE SOFTWARE.

 #include "hip/hip_runtime.h"
 #include "hip/hip_runtime_api.h"
-#include<iostream>
-#include<fstream>
-#include<vector>
+#include <iostream>
+#include <fstream>
+#include <vector>

 #define LEN 64
-#define SIZE LEN<<2
+#define SIZE LEN << 2

 #define fileName "test.co"
 #define kernel_name "vadd"

-int main(){
+int main() {
    float *A, *B, *C;
    hipDeviceptr_t Ad, Bd, Cd;
    A = new float[LEN];
    B = new float[LEN];
    C = new float[LEN];

-    for(uint32_t i=0;i<LEN;i++){
-        A[i] = i*1.0f;
+    for (uint32_t i = 0; i < LEN; i++) {
+        A[i] = i * 1.0f;
        B[i] = 1.0f;
        C[i] = 0.0f;
    }
@@ -65,16 +65,16 @@ int main(){
    hipModuleGetFunction(&Function, Module, kernel_name);

    int n = LEN;
-    void * args[4] = {&Ad, &Bd, &Cd, &n};
+    void* args[4] = {&Ad, &Bd, &Cd, &n};

    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, args, nullptr);

    hipMemcpyDtoH(C, Cd, SIZE);
    int mismatchCount = 0;
-    for(uint32_t i=0;i<LEN;i++){
+    for (uint32_t i = 0; i < LEN; i++) {
        if (A[i] + B[i] != C[i]) {
            mismatchCount++;
-            std::cout<<"error: mismatch " << A[i]<<" + "<<B[i]<<" != "<<C[i]<<std::endl;
+            std::cout << "error: mismatch " << A[i] << " + " << B[i] << " != " << C[i] << std::endl;
        }
    }

@@ -21,7 +21,6 @@ THE SOFTWARE.
 */


-
 #include "hip/hip_runtime.h"
 #include "hip/hip_runtime_api.h"
 #include <iostream>
@@ -33,22 +32,25 @@ THE SOFTWARE.
 #endif

 #define LEN 64
-#define SIZE LEN<<2
+#define SIZE LEN << 2

 #define fileName "vcpy_kernel.code.adipose"
 #define kernel_name "hello_world"

-#define HIP_CHECK(status) \
-if(status != hipSuccess) {std::cout<<"Got Status: "<<status<<" at Line: "<<__LINE__<<std::endl;exit(0);}
+#define HIP_CHECK(status)                                                                          \
+    if (status != hipSuccess) {                                                                    \
+        std::cout << "Got Status: " << status << " at Line: " << __LINE__ << std::endl;            \
+        exit(0);                                                                                   \
+    }

-int main(){
+int main() {
    float *A, *B;
    hipDeviceptr_t Ad, Bd;
    A = new float[LEN];
    B = new float[LEN];

-    for(uint32_t i=0;i<LEN;i++){
-        A[i] = i*1.0f;
+    for (uint32_t i = 0; i < LEN; i++) {
+        A[i] = i * 1.0f;
        B[i] = 0.0f;
    }

@@ -68,36 +70,33 @@ int main(){
    HIP_CHECK(hipModuleLoad(&Module, fileName));
    HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));

-		uint32_t len = LEN;
-		uint32_t one = 1;
+    uint32_t len = LEN;
+    uint32_t one = 1;

    struct {
-        void * _Ad;
-        void * _Bd;
+        void* _Ad;
+        void* _Bd;
    } args;

    args._Ad = Ad;
    args._Bd = Bd;


-
    size_t size = sizeof(args);

-    void *config[] = {
-      HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
-      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
-      HIP_LAUNCH_PARAM_END
-    };
+    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+                      HIP_LAUNCH_PARAM_END};

-    HIP_CHECK(hipHccModuleLaunchKernel(Function, LEN, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config));
+    HIP_CHECK(
+        hipHccModuleLaunchKernel(Function, LEN, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config));

    hipMemcpyDtoH(B, Bd, SIZE);

    int mismatchCount = 0;
-    for(uint32_t i=0;i<LEN;i++){
+    for (uint32_t i = 0; i < LEN; i++) {
        if (A[i] != B[i]) {
            mismatchCount++;
-            std::cout<<"error: mismatch " << A[i]<<" != "<<B[i]<<std::endl;
+            std::cout << "error: mismatch " << A[i] << " != " << B[i] << std::endl;
        }
    }

@@ -28,22 +28,25 @@ THE SOFTWARE.
 #include <hip/hip_hcc.h>

 #define LEN 64
-#define SIZE LEN<<2
+#define SIZE LEN << 2

 #define fileName "vcpy_kernel.code.adipose"
 #define kernel_name "hello_world"

-#define HIP_CHECK(status) \
-if(status != hipSuccess) {std::cout<<"Got Status: "<<status<<" at Line: "<<__LINE__<<std::endl;exit(0);}
+#define HIP_CHECK(status)                                                                          \
+    if (status != hipSuccess) {                                                                    \
+        std::cout << "Got Status: " << status << " at Line: " << __LINE__ << std::endl;            \
+        exit(0);                                                                                   \
+    }

-int main(){
+int main() {
    float *A, *B;
    hipDeviceptr_t Ad, Bd;
    A = new float[LEN];
    B = new float[LEN];

-    for(uint32_t i=0;i<LEN;i++){
-        A[i] = i*1.0f;
+    for (uint32_t i = 0; i < LEN; i++) {
+        A[i] = i * 1.0f;
        B[i] = 0.0f;
    }

@@ -64,12 +67,12 @@ int main(){
    HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));

 #ifdef __HIP_PLATFORM_HCC__
-		uint32_t len = LEN;
-		uint32_t one = 1;
+    uint32_t len = LEN;
+    uint32_t one = 1;

    struct {
-        void * _Ad;
-        void * _Bd;
+        void* _Ad;
+        void* _Bd;
    } args;

    args._Ad = Ad;
@@ -80,8 +83,8 @@ int main(){
 #ifdef __HIP_PLATFORM_NVCC__
    struct {
        uint32_t _hidden[1];
-        void * _Ad;
-        void * _Bd;
+        void* _Ad;
+        void* _Bd;
    } args;

    args._hidden[0] = 0;
@@ -92,21 +95,18 @@ int main(){

    size_t size = sizeof(args);

-    void *config[] = {
-      HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
-      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
-      HIP_LAUNCH_PARAM_END
-    };
+    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+                      HIP_LAUNCH_PARAM_END};

    HIP_CHECK(hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config));

    hipMemcpyDtoH(B, Bd, SIZE);

    int mismatchCount = 0;
-    for(uint32_t i=0;i<LEN;i++){
+    for (uint32_t i = 0; i < LEN; i++) {
        if (A[i] != B[i]) {
            mismatchCount++;
-            std::cout<<"error: mismatch " << A[i]<<" != "<<B[i]<<std::endl;
+            std::cout << "error: mismatch " << A[i] << " != " << B[i] << std::endl;
        }
    }

@@ -22,8 +22,7 @@ THE SOFTWARE.

 #include "hip/hip_runtime.h"

-extern "C" __global__ void hello_world(float *a, float *b)
-{
+extern "C" __global__ void hello_world(float* a, float* b) {
    int tx = hipThreadIdx_x;
    b[tx] = a[tx];
 }
@@ -28,25 +28,29 @@ THE SOFTWARE.
 #include <hip/hip_hcc.h>

 #define LEN 64
-#define SIZE LEN*sizeof(float)
+#define SIZE LEN * sizeof(float)

 #define fileName "vcpy_kernel.code.adipose"
 float myDeviceGlobal;
 float myDeviceGlobalArray[16];
-#define HIP_CHECK(cmd) \
-{\
-    hipError_t status = cmd;\
-    if(status != hipSuccess) {std::cout<<"error: #"<<status<<" ("<< hipGetErrorString(status) << ") at line:"<<__LINE__<<":  "<<#cmd<<std::endl;abort();}\
-}
+#define HIP_CHECK(cmd)                                                                             \
+    {                                                                                              \
+        hipError_t status = cmd;                                                                   \
+        if (status != hipSuccess) {                                                                \
+            std::cout << "error: #" << status << " (" << hipGetErrorString(status)                 \
+                      << ") at line:" << __LINE__ << ":  " << #cmd << std::endl;                   \
+            abort();                                                                               \
+        }                                                                                          \
+    }

-int main(){
+int main() {
    float *A, *B;
-    float* Ad, *Bd;
+    float *Ad, *Bd;
    A = new float[LEN];
    B = new float[LEN];

-    for(uint32_t i=0;i<LEN;i++){
-        A[i] = i*1.0f;
+    for (uint32_t i = 0; i < LEN; i++) {
+        A[i] = i * 1.0f;
        B[i] = 0.0f;
    }

@@ -70,18 +74,18 @@ int main(){
 #define ARRAY_SIZE 16

    float myDeviceGlobalArray_h[ARRAY_SIZE];
-    for (int i=0; i<ARRAY_SIZE; i++) {
-        myDeviceGlobalArray_h[i] = i*1000.0f;
-        myDeviceGlobalArray[i] = i*1000.0f;
+    for (int i = 0; i < ARRAY_SIZE; i++) {
+        myDeviceGlobalArray_h[i] = i * 1000.0f;
+        myDeviceGlobalArray[i] = i * 1000.0f;
    }

 #ifdef __HIP_PLATFORM_HCC__
-		uint32_t len = LEN;
-		uint32_t one = 1;
+    uint32_t len = LEN;
+    uint32_t one = 1;

    struct {
-        void * _Ad;
-        void * _Bd;
+        void* _Ad;
+        void* _Bd;
    } args;

    args._Ad = Ad;
@@ -92,8 +96,8 @@ int main(){
 #ifdef __HIP_PLATFORM_NVCC__
    struct {
        uint32_t _hidden[1];
-        void * _Ad;
-        void * _Bd;
+        void* _Ad;
+        void* _Bd;
    } args;

    args._hidden[0] = 0;
@@ -104,11 +108,8 @@ int main(){

    size_t size = sizeof(args);

-    void *config[] = {
-      HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
-      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
-      HIP_LAUNCH_PARAM_END
-    };
+    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+                      HIP_LAUNCH_PARAM_END};

    {
        hipFunction_t Function;
@@ -118,10 +119,10 @@ int main(){
        hipMemcpyDtoH(B, Bd, SIZE);

        int mismatchCount = 0;
-        for(uint32_t i=0;i<LEN;i++){
+        for (uint32_t i = 0; i < LEN; i++) {
            if (A[i] != B[i]) {
                mismatchCount++;
-                std::cout<<"error: mismatch " << A[i]<<" != "<<B[i]<<std::endl;
+                std::cout << "error: mismatch " << A[i] << " != " << B[i] << std::endl;
                if (mismatchCount >= 10) {
                    break;
                }
@@ -143,11 +144,11 @@ int main(){
        hipMemcpyDtoH(B, Bd, SIZE);

        int mismatchCount = 0;
-        for(uint32_t i=0;i<LEN;i++){
-            float expected = A[i] + myDeviceGlobal_h + myDeviceGlobalArray_h[i%16];
+        for (uint32_t i = 0; i < LEN; i++) {
+            float expected = A[i] + myDeviceGlobal_h + myDeviceGlobalArray_h[i % 16];
            if (expected != B[i]) {
                mismatchCount++;
-                std::cout<<"error: mismatch " << expected <<" != "<<B[i]<<std::endl;
+                std::cout << "error: mismatch " << expected << " != " << B[i] << std::endl;
                if (mismatchCount >= 10) {
                    break;
                }
@@ -25,17 +25,15 @@ THE SOFTWARE.
 #define ARRAY_SIZE (16)

 extern float myDeviceGlobal;
-extern float myDeviceGlobalArray[16];;
+extern float myDeviceGlobalArray[16];
+;

-extern "C" __global__ void hello_world(const float *a, float *b)
-{
+extern "C" __global__ void hello_world(const float* a, float* b) {
    int tx = hipThreadIdx_x;
    b[tx] = a[tx];
 }

-extern "C" __global__ void test_globals(const float *a, float *b)
-{
+extern "C" __global__ void test_globals(const float* a, float* b) {
    int tx = hipThreadIdx_x;
-    b[tx] = a[tx] + myDeviceGlobal+ myDeviceGlobalArray[tx%ARRAY_SIZE] ;
+    b[tx] = a[tx] + myDeviceGlobal + myDeviceGlobalArray[tx % ARRAY_SIZE];
 }
-
@@ -23,33 +23,31 @@ THE SOFTWARE.
 #include <stdio.h>
 #include "hip/hip_runtime.h"

-#define CHECK(cmd) \
-{\
-    hipError_t error  = cmd;\
-    if (error != hipSuccess) { \
-      fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
-    exit(EXIT_FAILURE);\
-	}\
-}
+#define CHECK(cmd)                                                                                 \
+    {                                                                                              \
+        hipError_t error = cmd;                                                                    \
+        if (error != hipSuccess) {                                                                 \
+            fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,         \
+                    __FILE__, __LINE__);                                                           \
+            exit(EXIT_FAILURE);                                                                    \
+        }                                                                                          \
+    }

 /*
 * Square each element in the array A and write to array C.
 */
 template <typename T>
-__global__ void
-vector_square(hipLaunchParm lp, T *C_d, const T *A_d, size_t N)
-{
+__global__ void vector_square(hipLaunchParm lp, T* C_d, const T* A_d, size_t N) {
    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t stride = hipBlockDim_x * hipGridDim_x;

-    for (size_t i=offset; i<N; i+=stride) {
+    for (size_t i = offset; i < N; i += stride) {
        C_d[i] = A_d[i] * A_d[i];
    }
 }


-int main(int argc, char *argv[])
-{
+int main(int argc, char* argv[]) {
    float *A_d, *C_d;
    float *A_h, *C_h;
    size_t N = 1000000;
@@ -57,43 +55,42 @@ int main(int argc, char *argv[])
    static int device = 0;
    CHECK(hipSetDevice(device));
    hipDeviceProp_t props;
-    CHECK(hipGetDeviceProperties(&props, device/*deviceID*/));
-    printf ("info: running on device %s\n", props.name);
-    #ifdef __HIP_PLATFORM_HCC__
-      printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch);
-    #endif
-    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    CHECK(hipGetDeviceProperties(&props, device /*deviceID*/));
+    printf("info: running on device %s\n", props.name);
+#ifdef __HIP_PLATFORM_HCC__
+    printf("info: architecture on AMD GPU device is: %d\n", props.gcnArch);
+#endif
+    printf("info: allocate host mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0);
    A_h = (float*)malloc(Nbytes);
-    CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess);
    C_h = (float*)malloc(Nbytes);
-    CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
-	// Fill with Phi + i
-    for (size_t i=0; i<N; i++)
-    {
+    CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess);
+    // Fill with Phi + i
+    for (size_t i = 0; i < N; i++) {
        A_h[i] = 1.618f + i;
    }

-    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    printf("info: allocate device mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0);
    CHECK(hipMalloc(&A_d, Nbytes));
    CHECK(hipMalloc(&C_d, Nbytes));

-    printf ("info: copy Host2Device\n");
-    CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+    printf("info: copy Host2Device\n");
+    CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));

    const unsigned blocks = 512;
    const unsigned threadsPerBlock = 256;

-    printf ("info: launch 'vector_square' kernel\n");
+    printf("info: launch 'vector_square' kernel\n");
    hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);

-    printf ("info: copy Device2Host\n");
-    CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+    printf("info: copy Device2Host\n");
+    CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

-    printf ("info: check result\n");
-    for (size_t i=0; i<N; i++)  {
+    printf("info: check result\n");
+    for (size_t i = 0; i < N; i++) {
        if (C_h[i] != A_h[i] * A_h[i]) {
            CHECK(hipErrorUnknown);
        }
    }
-    printf ("PASSED!\n");
+    printf("PASSED!\n");
 }
@@ -10,96 +10,72 @@ using namespace std;
 #define SORT_RETAIN_ATTS_ORDER 1


-bool ResultDatabase::Result::operator<(const Result &rhs) const
-{
-    if (test < rhs.test)
-        return true;
-    if (test > rhs.test)
-        return false;
-#if (SORT_RETAIN_ATTS_ORDER == 0) 
+bool ResultDatabase::Result::operator<(const Result& rhs) const {
+    if (test < rhs.test) return true;
+    if (test > rhs.test) return false;
+#if (SORT_RETAIN_ATTS_ORDER == 0)
    // For ties, sort by the value of the attribute:
-    if (atts < rhs.atts)
-        return true;
-    if (atts > rhs.atts)
-        return false;
+    if (atts < rhs.atts) return true;
+    if (atts > rhs.atts) return false;
 #endif
-    return false; // less-operator returns false on equal
+    return false;  // less-operator returns false on equal
 }

-double ResultDatabase::Result::GetMin() const
-{
+double ResultDatabase::Result::GetMin() const {
    double r = FLT_MAX;
-    for (int i=0; i<value.size(); i++)
-    {
+    for (int i = 0; i < value.size(); i++) {
        r = min(r, value[i]);
    }
    return r;
 }

-double ResultDatabase::Result::GetMax() const
-{
+double ResultDatabase::Result::GetMax() const {
    double r = -FLT_MAX;
-    for (int i=0; i<value.size(); i++)
-    {
+    for (int i = 0; i < value.size(); i++) {
        r = max(r, value[i]);
    }
    return r;
 }

-double ResultDatabase::Result::GetMedian() const
-{
-    return GetPercentile(50);
-}
+double ResultDatabase::Result::GetMedian() const { return GetPercentile(50); }

-double ResultDatabase::Result::GetPercentile(double q) const
-{
+double ResultDatabase::Result::GetPercentile(double q) const {
    int n = value.size();
-    if (n == 0)
-        return FLT_MAX;
-    if (n == 1)
-        return value[0];
+    if (n == 0) return FLT_MAX;
+    if (n == 1) return value[0];

-    if (q <= 0)
-        return value[0];
-    if (q >= 100)
-        return value[n-1];
+    if (q <= 0) return value[0];
+    if (q >= 100) return value[n - 1];

    double index = ((n + 1.) * q / 100.) - 1;

    vector<double> sorted = value;
    sort(sorted.begin(), sorted.end());

-    if (n == 2)
-        return (sorted[0] * (1 - q/100.)  +  sorted[1] * (q/100.));
+    if (n == 2) return (sorted[0] * (1 - q / 100.) + sorted[1] * (q / 100.));

    int index_lo = int(index);
    double frac = index - index_lo;
-    if (frac == 0)
-        return sorted[index_lo];
+    if (frac == 0) return sorted[index_lo];

    double lo = sorted[index_lo];
    double hi = sorted[index_lo + 1];
-    return lo + (hi-lo)*frac;
+    return lo + (hi - lo) * frac;
 }

-double ResultDatabase::Result::GetMean() const
-{
+double ResultDatabase::Result::GetMean() const {
    double r = 0;
-    for (int i=0; i<value.size(); i++)
-    {
+    for (int i = 0; i < value.size(); i++) {
        r += value[i];
    }
    return r / double(value.size());
 }

-double ResultDatabase::Result::GetStdDev() const
-{
+double ResultDatabase::Result::GetStdDev() const {
    double r = 0;
    double u = GetMean();
-    if (u == FLT_MAX)
-        return FLT_MAX;
-    for (int i=0; i<value.size(); i++)
-    {
+    if (u == FLT_MAX) return FLT_MAX;
+    for (int i = 0; i < value.size(); i++) {
        r += (value[i] - u) * (value[i] - u);
    }
    r = sqrt(r / value.size());
@@ -107,58 +83,42 @@ double ResultDatabase::Result::GetStdDev() const
 }


-void ResultDatabase::AddResults(const string &test,
-                                const string &atts,
-                                const string &unit,
-                                const vector<double> &values)
-{
-    for (int i=0; i<values.size(); i++)
-    {
+void ResultDatabase::AddResults(const string& test, const string& atts, const string& unit,
+                                const vector<double>& values) {
+    for (int i = 0; i < values.size(); i++) {
        AddResult(test, atts, unit, values[i]);
    }
 }

-static string RemoveAllButLeadingSpaces(const string &a)
-{
+static string RemoveAllButLeadingSpaces(const string& a) {
    string b;
    int n = a.length();
    int i = 0;
-    while (i<n && a[i] == ' ')
-    {
+    while (i < n && a[i] == ' ') {
        b += a[i];
        ++i;
    }
-    for (; i<n; i++)
-    {
-        if (a[i] != ' ' && a[i] != '\t')
-            b += a[i];
+    for (; i < n; i++) {
+        if (a[i] != ' ' && a[i] != '\t') b += a[i];
    }
    return b;
 }

-void ResultDatabase::AddResult(const string &test_orig,
-                               const string &atts_orig,
-                               const string &unit_orig,
-                               double value)
-{
+void ResultDatabase::AddResult(const string& test_orig, const string& atts_orig,
+                               const string& unit_orig, double value) {
    string test = RemoveAllButLeadingSpaces(test_orig);
    string atts = RemoveAllButLeadingSpaces(atts_orig);
    string unit = RemoveAllButLeadingSpaces(unit_orig);
    int index;
-    for (index = 0; index < results.size(); index++)
-    {
-        if (results[index].test == test &&
-            results[index].atts == atts)
-        {
-            if (results[index].unit != unit)
-                throw "Internal error: mixed units";
+    for (index = 0; index < results.size(); index++) {
+        if (results[index].test == test && results[index].atts == atts) {
+            if (results[index].unit != unit) throw "Internal error: mixed units";

            break;
        }
    }

-    if (index >= results.size())
-    {
+    if (index >= results.size()) {
        Result r;
        r.test = test;
        r.atts = atts;
@@ -192,41 +152,33 @@ void ResultDatabase::AddResult(const string &test_orig,
 //    Changed note about missing values to be worded a little better.
 //
 // ****************************************************************************
-void ResultDatabase::DumpDetailed(ostream &out)
-{
+void ResultDatabase::DumpDetailed(ostream& out) {
    vector<Result> sorted(results);

    stable_sort(sorted.begin(), sorted.end());

-    const int testNameW = 24 ;
+    const int testNameW = 24;
    const int attW = 12;
    const int fieldW = 11;
    out << std::fixed << right << std::setprecision(4);

    int maxtrials = 1;
-    for (int i=0; i<sorted.size(); i++)
-    {
-        if (sorted[i].value.size() > maxtrials)
-            maxtrials = sorted[i].value.size();
+    for (int i = 0; i < sorted.size(); i++) {
+        if (sorted[i].value.size() > maxtrials) maxtrials = sorted[i].value.size();
    }

    // TODO: in big parallel runs, the "trials" are the procs
    // and we really don't want to print them all out....
-    out << setw(testNameW) << "test\t"  
-        << setw(attW) << "atts\t"
-        << setw(fieldW) 
-        << "median\t"
+    out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "median\t"
        << "mean\t"
        << "stddev\t"
        << "min\t"
        << "max\t";
-    for (int i=0; i<maxtrials; i++)
-        out << "trial"<<i<<"\t";
+    for (int i = 0; i < maxtrials; i++) out << "trial" << i << "\t";
    out << endl;

-    for (int i=0; i<sorted.size(); i++)
-    {
-        Result &r = sorted[i];
+    for (int i = 0; i < sorted.size(); i++) {
+        Result& r = sorted[i];
        out << setw(testNameW) << r.test + "\t";
        out << setw(attW) << r.atts + "\t";
        out << setw(fieldW) << r.unit + "\t";
@@ -237,7 +189,7 @@ void ResultDatabase::DumpDetailed(ostream &out)
        if (r.GetMean() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMean()   << "\t";
+            out << r.GetMean() << "\t";
        if (r.GetStdDev() == FLT_MAX)
            out << "N/A\t";
        else
@@ -245,13 +197,12 @@ void ResultDatabase::DumpDetailed(ostream &out)
        if (r.GetMin() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMin()    << "\t";
+            out << r.GetMin() << "\t";
        if (r.GetMax() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMax()    << "\t";
-        for (int j=0; j<r.value.size(); j++)
-        {
+            out << r.GetMax() << "\t";
+        for (int j = 0; j < r.value.size(); j++) {
            if (r.value[j] == FLT_MAX)
                out << "N/A\t";
            else
@@ -285,23 +236,19 @@ void ResultDatabase::DumpDetailed(ostream &out)
 //    Added note about (*) missing value tag.
 //
 // ****************************************************************************
-void ResultDatabase::DumpSummary(ostream &out)
-{
+void ResultDatabase::DumpSummary(ostream& out) {
    vector<Result> sorted(results);

    stable_sort(sorted.begin(), sorted.end());

-    const int testNameW = 24 ;
+    const int testNameW = 24;
    const int attW = 12;
    const int fieldW = 9;
    out << std::fixed << right << std::setprecision(4);

    // TODO: in big parallel runs, the "trials" are the procs
    // and we really don't want to print them all out....
-    out << setw(testNameW) << "test\t"  
-        << setw(attW) << "atts\t"
-        << setw(fieldW) 
-        << "units\t"
+    out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "units\t"
        << "median\t"
        << "mean\t"
        << "stddev\t"
@@ -309,9 +256,8 @@ void ResultDatabase::DumpSummary(ostream &out)
        << "max\t";
    out << endl;

-    for (int i=0; i<sorted.size(); i++)
-    {
-        Result &r = sorted[i];
+    for (int i = 0; i < sorted.size(); i++) {
+        Result& r = sorted[i];
        out << setw(testNameW) << r.test + "\t";
        out << setw(attW) << r.atts + "\t";
        out << setw(fieldW) << r.unit + "\t";
@@ -322,7 +268,7 @@ void ResultDatabase::DumpSummary(ostream &out)
        if (r.GetMean() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMean()   << "\t";
+            out << r.GetMean() << "\t";
        if (r.GetStdDev() == FLT_MAX)
            out << "N/A\t";
        else
@@ -330,11 +276,11 @@ void ResultDatabase::DumpSummary(ostream &out)
        if (r.GetMin() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMin()    << "\t";
+            out << r.GetMin() << "\t";
        if (r.GetMax() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMax()    << "\t";
+            out << r.GetMax() << "\t";

        out << endl;
    }
@@ -359,10 +305,7 @@ void ResultDatabase::DumpSummary(ostream &out)
 //
 //
 // ****************************************************************************
-void ResultDatabase::ClearAllResults()
-{
-	results.clear();	
-}
+void ResultDatabase::ClearAllResults() { results.clear(); }

 // ****************************************************************************
 //  Method:  ResultDatabase::DumpCsv
@@ -380,39 +323,36 @@ void ResultDatabase::ClearAllResults()
 //  Modifications:
 //
 // ****************************************************************************
-void ResultDatabase::DumpCsv(string fileName)
-{
+void ResultDatabase::DumpCsv(string fileName) {
    bool emptyFile;
    vector<Result> sorted(results);

    stable_sort(sorted.begin(), sorted.end());

-    //Check to see if the file is empty - if so, add the headers
+    // Check to see if the file is empty - if so, add the headers
    emptyFile = this->IsFileEmpty(fileName);

-    //Open file and append by default
+    // Open file and append by default
    ofstream out;
-    out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app); 
+    out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app);

-    //Add headers only for empty files
-    if(emptyFile)
-    {
-    // TODO: in big parallel runs, the "trials" are the procs
-    // and we really don't want to print them all out....
-    out << "test, "
-        << "atts, "
-        << "units, "
-        << "median, "
-        << "mean, "
-        << "stddev, "
-        << "min, "
-        << "max, ";
-    out << endl;
+    // Add headers only for empty files
+    if (emptyFile) {
+        // TODO: in big parallel runs, the "trials" are the procs
+        // and we really don't want to print them all out....
+        out << "test, "
+            << "atts, "
+            << "units, "
+            << "median, "
+            << "mean, "
+            << "stddev, "
+            << "min, "
+            << "max, ";
+        out << endl;
    }

-    for (int i=0; i<sorted.size(); i++)
-    {
-        Result &r = sorted[i];
+    for (int i = 0; i < sorted.size(); i++) {
+        Result& r = sorted[i];
        out << r.test << ", ";
        out << r.atts << ", ";
        out << r.unit << ", ";
@@ -423,7 +363,7 @@ void ResultDatabase::DumpCsv(string fileName)
        if (r.GetMean() == FLT_MAX)
            out << "N/A, ";
        else
-            out << r.GetMean()   << ", ";
+            out << r.GetMean() << ", ";
        if (r.GetStdDev() == FLT_MAX)
            out << "N/A, ";
        else
@@ -431,11 +371,11 @@ void ResultDatabase::DumpCsv(string fileName)
        if (r.GetMin() == FLT_MAX)
            out << "N/A, ";
        else
-            out << r.GetMin()    << ", ";
+            out << r.GetMin() << ", ";
        if (r.GetMax() == FLT_MAX)
            out << "N/A, ";
        else
-            out << r.GetMax()    << ", ";
+            out << r.GetMax() << ", ";

        out << endl;
    }
@@ -460,29 +400,24 @@ void ResultDatabase::DumpCsv(string fileName)
 //
 // ****************************************************************************

-bool ResultDatabase::IsFileEmpty(string fileName)
-{
-      bool fileEmpty;
+bool ResultDatabase::IsFileEmpty(string fileName) {
+    bool fileEmpty;

-      ifstream file(fileName.c_str());
+    ifstream file(fileName.c_str());

-      //If the file doesn't exist it is by definition empty
-      if(!file.good())
-      {
+    // If the file doesn't exist it is by definition empty
+    if (!file.good()) {
        return true;
-      }
-      else
-      {
+    } else {
        fileEmpty = (bool)(file.peek() == ifstream::traits_type::eof());
        file.close();
-        
-	return fileEmpty;
-      }
-  
-      //Otherwise, return false  
-        return false;
-}

+        return fileEmpty;
+    }
+
+    // Otherwise, return false
+    return false;
+}


 // ****************************************************************************
@@ -500,16 +435,12 @@ bool ResultDatabase::IsFileEmpty(string fileName)
 //  Modifications:
 //
 // ****************************************************************************
-vector<ResultDatabase::Result>
-ResultDatabase::GetResultsForTest(const string &test)
-{
+vector<ResultDatabase::Result> ResultDatabase::GetResultsForTest(const string& test) {
    // get only the given test results
    vector<Result> retval;
-    for (int i=0; i<results.size(); i++)
-    {
-        Result &r = results[i];
-        if (r.test == test)
-            retval.push_back(r);
+    for (int i = 0; i < results.size(); i++) {
+        Result& r = results[i];
+        if (r.test == test) retval.push_back(r);
    }
    return retval;
 }
@@ -528,8 +459,4 @@ ResultDatabase::GetResultsForTest(const string &test)
 //  Modifications:
 //
 // ****************************************************************************
-const vector<ResultDatabase::Result> &
-ResultDatabase::GetResults() const
-{
-    return results;
-}
+const vector<ResultDatabase::Result>& ResultDatabase::GetResults() const { return results; }
@@ -6,11 +6,11 @@
 #include <iostream>
 #include <fstream>
 #include <cfloat>
+using std::ifstream;
+using std::ofstream;
+using std::ostream;
 using std::string;
 using std::vector;
-using std::ostream;
-using std::ofstream;
-using std::ifstream;


 // ****************************************************************************
@@ -40,18 +40,16 @@ using std::ifstream;
 //    Added a GetResults method as well, and made several functions const.
 //
 // ****************************************************************************
-class ResultDatabase
-{
-  public:
+class ResultDatabase {
+   public:
    //
    // A performance result for a single SHOC benchmark run.
    //
-    struct Result
-    {
-        string test;  // e.g. "readback"
-        string atts;  // e.g. "pagelocked 4k^2"
-        string unit;  // e.g. "MB/sec"
-        vector<double> value; // e.g. "837.14"
+    struct Result {
+        string test;           // e.g. "readback"
+        string atts;           // e.g. "pagelocked 4k^2"
+        string unit;           // e.g. "MB/sec"
+        vector<double> value;  // e.g. "837.14"
        double GetMin() const;
        double GetMax() const;
        double GetMedian() const;
@@ -59,41 +57,32 @@ class ResultDatabase
        double GetMean() const;
        double GetStdDev() const;

-        bool operator<(const Result &rhs) const;
+        bool operator<(const Result& rhs) const;

-        bool HadAnyFLTMAXValues() const
-        {
-            for (int i=0; i<value.size(); ++i)
-            {
-                if (value[i] >= FLT_MAX)
-                    return true;
+        bool HadAnyFLTMAXValues() const {
+            for (int i = 0; i < value.size(); ++i) {
+                if (value[i] >= FLT_MAX) return true;
            }
            return false;
        }
    };

-  protected:
+   protected:
    vector<Result> results;

-  public:
-    void AddResult(const string &test,
-                   const string &atts,
-                   const string &unit,
-                   double value);
-    void AddResults(const string &test,
-                    const string &atts,
-                    const string &unit,
-                    const vector<double> &values);
-    vector<Result>        GetResultsForTest(const string &test);
-    const vector<Result> &GetResults() const;
+   public:
+    void AddResult(const string& test, const string& atts, const string& unit, double value);
+    void AddResults(const string& test, const string& atts, const string& unit,
+                    const vector<double>& values);
+    vector<Result> GetResultsForTest(const string& test);
+    const vector<Result>& GetResults() const;
    void ClearAllResults();
    void DumpDetailed(ostream&);
    void DumpSummary(ostream&);
    void DumpCsv(string fileName);

-  private:
+   private:
    bool IsFileEmpty(string fileName);
-
 };


@@ -7,93 +7,69 @@

 using namespace std;

-bool ResultDatabase::Result::operator<(const Result &rhs) const
-{
-    if (test < rhs.test)
-        return true;
-    if (test > rhs.test)
-        return false;
-    if (atts < rhs.atts)
-        return true;
-    if (atts > rhs.atts)
-        return false;
-    return false; // less-operator returns false on equal
+bool ResultDatabase::Result::operator<(const Result& rhs) const {
+    if (test < rhs.test) return true;
+    if (test > rhs.test) return false;
+    if (atts < rhs.atts) return true;
+    if (atts > rhs.atts) return false;
+    return false;  // less-operator returns false on equal
 }

-double ResultDatabase::Result::GetMin() const
-{
+double ResultDatabase::Result::GetMin() const {
    double r = FLT_MAX;
-    for (int i=0; i<value.size(); i++)
-    {
+    for (int i = 0; i < value.size(); i++) {
        r = min(r, value[i]);
    }
    return r;
 }

-double ResultDatabase::Result::GetMax() const
-{
+double ResultDatabase::Result::GetMax() const {
    double r = -FLT_MAX;
-    for (int i=0; i<value.size(); i++)
-    {
+    for (int i = 0; i < value.size(); i++) {
        r = max(r, value[i]);
    }
    return r;
 }

-double ResultDatabase::Result::GetMedian() const
-{
-    return GetPercentile(50);
-}
+double ResultDatabase::Result::GetMedian() const { return GetPercentile(50); }

-double ResultDatabase::Result::GetPercentile(double q) const
-{
+double ResultDatabase::Result::GetPercentile(double q) const {
    int n = value.size();
-    if (n == 0)
-        return FLT_MAX;
-    if (n == 1)
-        return value[0];
+    if (n == 0) return FLT_MAX;
+    if (n == 1) return value[0];

-    if (q <= 0)
-        return value[0];
-    if (q >= 100)
-        return value[n-1];
+    if (q <= 0) return value[0];
+    if (q >= 100) return value[n - 1];

    double index = ((n + 1.) * q / 100.) - 1;

    vector<double> sorted = value;
    sort(sorted.begin(), sorted.end());

-    if (n == 2)
-        return (sorted[0] * (1 - q/100.)  +  sorted[1] * (q/100.));
+    if (n == 2) return (sorted[0] * (1 - q / 100.) + sorted[1] * (q / 100.));

    int index_lo = int(index);
    double frac = index - index_lo;
-    if (frac == 0)
-        return sorted[index_lo];
+    if (frac == 0) return sorted[index_lo];

    double lo = sorted[index_lo];
    double hi = sorted[index_lo + 1];
-    return lo + (hi-lo)*frac;
+    return lo + (hi - lo) * frac;
 }

-double ResultDatabase::Result::GetMean() const
-{
+double ResultDatabase::Result::GetMean() const {
    double r = 0;
-    for (int i=0; i<value.size(); i++)
-    {
+    for (int i = 0; i < value.size(); i++) {
        r += value[i];
    }
    return r / double(value.size());
 }

-double ResultDatabase::Result::GetStdDev() const
-{
+double ResultDatabase::Result::GetStdDev() const {
    double r = 0;
    double u = GetMean();
-    if (u == FLT_MAX)
-        return FLT_MAX;
-    for (int i=0; i<value.size(); i++)
-    {
+    if (u == FLT_MAX) return FLT_MAX;
+    for (int i = 0; i < value.size(); i++) {
        r += (value[i] - u) * (value[i] - u);
    }
    r = sqrt(r / value.size());
@@ -101,58 +77,42 @@ double ResultDatabase::Result::GetStdDev() const
 }


-void ResultDatabase::AddResults(const string &test,
-                                const string &atts,
-                                const string &unit,
-                                const vector<double> &values)
-{
-    for (int i=0; i<values.size(); i++)
-    {
+void ResultDatabase::AddResults(const string& test, const string& atts, const string& unit,
+                                const vector<double>& values) {
+    for (int i = 0; i < values.size(); i++) {
        AddResult(test, atts, unit, values[i]);
    }
 }

-static string RemoveAllButLeadingSpaces(const string &a)
-{
+static string RemoveAllButLeadingSpaces(const string& a) {
    string b;
    int n = a.length();
    int i = 0;
-    while (i<n && a[i] == ' ')
-    {
+    while (i < n && a[i] == ' ') {
        b += a[i];
        ++i;
    }
-    for (; i<n; i++)
-    {
-        if (a[i] != ' ' && a[i] != '\t')
-            b += a[i];
+    for (; i < n; i++) {
+        if (a[i] != ' ' && a[i] != '\t') b += a[i];
    }
    return b;
 }

-void ResultDatabase::AddResult(const string &test_orig,
-                               const string &atts_orig,
-                               const string &unit_orig,
-                               double value)
-{
+void ResultDatabase::AddResult(const string& test_orig, const string& atts_orig,
+                               const string& unit_orig, double value) {
    string test = RemoveAllButLeadingSpaces(test_orig);
    string atts = RemoveAllButLeadingSpaces(atts_orig);
    string unit = RemoveAllButLeadingSpaces(unit_orig);
    int index;
-    for (index = 0; index < results.size(); index++)
-    {
-        if (results[index].test == test &&
-            results[index].atts == atts)
-        {
-            if (results[index].unit != unit)
-                throw "Internal error: mixed units";
+    for (index = 0; index < results.size(); index++) {
+        if (results[index].test == test && results[index].atts == atts) {
+            if (results[index].unit != unit) throw "Internal error: mixed units";

            break;
        }
    }

-    if (index >= results.size())
-    {
+    if (index >= results.size()) {
        Result r;
        r.test = test;
        r.atts = atts;
@@ -186,40 +146,32 @@ void ResultDatabase::AddResult(const string &test_orig,
 //    Changed note about missing values to be worded a little better.
 //
 // ****************************************************************************
-void ResultDatabase::DumpDetailed(ostream &out)
-{
+void ResultDatabase::DumpDetailed(ostream& out) {
    vector<Result> sorted(results);
    sort(sorted.begin(), sorted.end());

-    const int testNameW = 24 ;
+    const int testNameW = 24;
    const int attW = 12;
    const int fieldW = 11;
    out << std::fixed << right << std::setprecision(4);

    int maxtrials = 1;
-    for (int i=0; i<sorted.size(); i++)
-    {
-        if (sorted[i].value.size() > maxtrials)
-            maxtrials = sorted[i].value.size();
+    for (int i = 0; i < sorted.size(); i++) {
+        if (sorted[i].value.size() > maxtrials) maxtrials = sorted[i].value.size();
    }

    // TODO: in big parallel runs, the "trials" are the procs
    // and we really don't want to print them all out....
-    out << setw(testNameW) << "test\t"  
-        << setw(attW) << "atts\t"
-        << setw(fieldW) 
-        << "median\t"
+    out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "median\t"
        << "mean\t"
        << "stddev\t"
        << "min\t"
        << "max\t";
-    for (int i=0; i<maxtrials; i++)
-        out << "trial"<<i<<"\t";
+    for (int i = 0; i < maxtrials; i++) out << "trial" << i << "\t";
    out << endl;

-    for (int i=0; i<sorted.size(); i++)
-    {
-        Result &r = sorted[i];
+    for (int i = 0; i < sorted.size(); i++) {
+        Result& r = sorted[i];
        out << setw(testNameW) << r.test + "\t";
        out << setw(attW) << r.atts + "\t";
        out << setw(fieldW) << r.unit + "\t";
@@ -230,7 +182,7 @@ void ResultDatabase::DumpDetailed(ostream &out)
        if (r.GetMean() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMean()   << "\t";
+            out << r.GetMean() << "\t";
        if (r.GetStdDev() == FLT_MAX)
            out << "N/A\t";
        else
@@ -238,13 +190,12 @@ void ResultDatabase::DumpDetailed(ostream &out)
        if (r.GetMin() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMin()    << "\t";
+            out << r.GetMin() << "\t";
        if (r.GetMax() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMax()    << "\t";
-        for (int j=0; j<r.value.size(); j++)
-        {
+            out << r.GetMax() << "\t";
+        for (int j = 0; j < r.value.size(); j++) {
            if (r.value[j] == FLT_MAX)
                out << "N/A\t";
            else
@@ -278,22 +229,18 @@ void ResultDatabase::DumpDetailed(ostream &out)
 //    Added note about (*) missing value tag.
 //
 // ****************************************************************************
-void ResultDatabase::DumpSummary(ostream &out)
-{
+void ResultDatabase::DumpSummary(ostream& out) {
    vector<Result> sorted(results);
    sort(sorted.begin(), sorted.end());

-    const int testNameW = 24 ;
+    const int testNameW = 24;
    const int attW = 12;
    const int fieldW = 9;
    out << std::fixed << right << std::setprecision(4);

    // TODO: in big parallel runs, the "trials" are the procs
    // and we really don't want to print them all out....
-    out << setw(testNameW) << "test\t"  
-        << setw(attW) << "atts\t"
-        << setw(fieldW) 
-        << "units\t"
+    out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "units\t"
        << "median\t"
        << "mean\t"
        << "stddev\t"
@@ -301,9 +248,8 @@ void ResultDatabase::DumpSummary(ostream &out)
        << "max\t";
    out << endl;

-    for (int i=0; i<sorted.size(); i++)
-    {
-        Result &r = sorted[i];
+    for (int i = 0; i < sorted.size(); i++) {
+        Result& r = sorted[i];
        out << setw(testNameW) << r.test + "\t";
        out << setw(attW) << r.atts + "\t";
        out << setw(fieldW) << r.unit + "\t";
@@ -314,7 +260,7 @@ void ResultDatabase::DumpSummary(ostream &out)
        if (r.GetMean() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMean()   << "\t";
+            out << r.GetMean() << "\t";
        if (r.GetStdDev() == FLT_MAX)
            out << "N/A\t";
        else
@@ -322,11 +268,11 @@ void ResultDatabase::DumpSummary(ostream &out)
        if (r.GetMin() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMin()    << "\t";
+            out << r.GetMin() << "\t";
        if (r.GetMax() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMax()    << "\t";
+            out << r.GetMax() << "\t";

        out << endl;
    }
@@ -351,10 +297,7 @@ void ResultDatabase::DumpSummary(ostream &out)
 //
 //
 // ****************************************************************************
-void ResultDatabase::ClearAllResults()
-{
-	results.clear();	
-}
+void ResultDatabase::ClearAllResults() { results.clear(); }

 // ****************************************************************************
 //  Method:  ResultDatabase::DumpCsv
@@ -372,39 +315,36 @@ void ResultDatabase::ClearAllResults()
 //  Modifications:
 //
 // ****************************************************************************
-void ResultDatabase::DumpCsv(string fileName)
-{
+void ResultDatabase::DumpCsv(string fileName) {
    bool emptyFile;
    vector<Result> sorted(results);

    sort(sorted.begin(), sorted.end());

-    //Check to see if the file is empty - if so, add the headers
+    // Check to see if the file is empty - if so, add the headers
    emptyFile = this->IsFileEmpty(fileName);

-    //Open file and append by default
+    // Open file and append by default
    ofstream out;
-    out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app); 
+    out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app);

-    //Add headers only for empty files
-    if(emptyFile)
-    {
-    // TODO: in big parallel runs, the "trials" are the procs
-    // and we really don't want to print them all out....
-    out << "test, "
-        << "atts, "
-        << "units, "
-        << "median, "
-        << "mean, "
-        << "stddev, "
-        << "min, "
-        << "max, ";
-    out << endl;
+    // Add headers only for empty files
+    if (emptyFile) {
+        // TODO: in big parallel runs, the "trials" are the procs
+        // and we really don't want to print them all out....
+        out << "test, "
+            << "atts, "
+            << "units, "
+            << "median, "
+            << "mean, "
+            << "stddev, "
+            << "min, "
+            << "max, ";
+        out << endl;
    }

-    for (int i=0; i<sorted.size(); i++)
-    {
-        Result &r = sorted[i];
+    for (int i = 0; i < sorted.size(); i++) {
+        Result& r = sorted[i];
        out << r.test << ", ";
        out << r.atts << ", ";
        out << r.unit << ", ";
@@ -415,7 +355,7 @@ void ResultDatabase::DumpCsv(string fileName)
        if (r.GetMean() == FLT_MAX)
            out << "N/A, ";
        else
-            out << r.GetMean()   << ", ";
+            out << r.GetMean() << ", ";
        if (r.GetStdDev() == FLT_MAX)
            out << "N/A, ";
        else
@@ -423,11 +363,11 @@ void ResultDatabase::DumpCsv(string fileName)
        if (r.GetMin() == FLT_MAX)
            out << "N/A, ";
        else
-            out << r.GetMin()    << ", ";
+            out << r.GetMin() << ", ";
        if (r.GetMax() == FLT_MAX)
            out << "N/A, ";
        else
-            out << r.GetMax()    << ", ";
+            out << r.GetMax() << ", ";

        out << endl;
    }
@@ -452,29 +392,24 @@ void ResultDatabase::DumpCsv(string fileName)
 //
 // ****************************************************************************

-bool ResultDatabase::IsFileEmpty(string fileName)
-{
-      bool fileEmpty;
+bool ResultDatabase::IsFileEmpty(string fileName) {
+    bool fileEmpty;

-      ifstream file(fileName.c_str());
+    ifstream file(fileName.c_str());

-      //If the file doesn't exist it is by definition empty
-      if(!file.good())
-      {
+    // If the file doesn't exist it is by definition empty
+    if (!file.good()) {
        return true;
-      }
-      else
-      {
+    } else {
        fileEmpty = (bool)(file.peek() == ifstream::traits_type::eof());
        file.close();
-        
-	return fileEmpty;
-      }
-  
-      //Otherwise, return false  
-        return false;
-}

+        return fileEmpty;
+    }
+
+    // Otherwise, return false
+    return false;
+}


 // ****************************************************************************
@@ -492,16 +427,12 @@ bool ResultDatabase::IsFileEmpty(string fileName)
 //  Modifications:
 //
 // ****************************************************************************
-vector<ResultDatabase::Result>
-ResultDatabase::GetResultsForTest(const string &test)
-{
+vector<ResultDatabase::Result> ResultDatabase::GetResultsForTest(const string& test) {
    // get only the given test results
    vector<Result> retval;
-    for (int i=0; i<results.size(); i++)
-    {
-        Result &r = results[i];
-        if (r.test == test)
-            retval.push_back(r);
+    for (int i = 0; i < results.size(); i++) {
+        Result& r = results[i];
+        if (r.test == test) retval.push_back(r);
    }
    return retval;
 }
@@ -520,8 +451,4 @@ ResultDatabase::GetResultsForTest(const string &test)
 //  Modifications:
 //
 // ****************************************************************************
-const vector<ResultDatabase::Result> &
-ResultDatabase::GetResults() const
-{
-    return results;
-}
+const vector<ResultDatabase::Result>& ResultDatabase::GetResults() const { return results; }
@@ -6,11 +6,11 @@
 #include <iostream>
 #include <fstream>
 #include <cfloat>
+using std::ifstream;
+using std::ofstream;
+using std::ostream;
 using std::string;
 using std::vector;
-using std::ostream;
-using std::ofstream;
-using std::ifstream;


 // ****************************************************************************
@@ -40,18 +40,16 @@ using std::ifstream;
 //    Added a GetResults method as well, and made several functions const.
 //
 // ****************************************************************************
-class ResultDatabase
-{
-  public:
+class ResultDatabase {
+   public:
    //
    // A performance result for a single SHOC benchmark run.
    //
-    struct Result
-    {
-        string test;  // e.g. "readback"
-        string atts;  // e.g. "pagelocked 4k^2"
-        string unit;  // e.g. "MB/sec"
-        vector<double> value; // e.g. "837.14"
+    struct Result {
+        string test;           // e.g. "readback"
+        string atts;           // e.g. "pagelocked 4k^2"
+        string unit;           // e.g. "MB/sec"
+        vector<double> value;  // e.g. "837.14"
        double GetMin() const;
        double GetMax() const;
        double GetMedian() const;
@@ -59,41 +57,32 @@ class ResultDatabase
        double GetMean() const;
        double GetStdDev() const;

-        bool operator<(const Result &rhs) const;
+        bool operator<(const Result& rhs) const;

-        bool HadAnyFLTMAXValues() const
-        {
-            for (int i=0; i<value.size(); ++i)
-            {
-                if (value[i] >= FLT_MAX)
-                    return true;
+        bool HadAnyFLTMAXValues() const {
+            for (int i = 0; i < value.size(); ++i) {
+                if (value[i] >= FLT_MAX) return true;
            }
            return false;
        }
    };

-  protected:
+   protected:
    vector<Result> results;

-  public:
-    void AddResult(const string &test,
-                   const string &atts,
-                   const string &unit,
-                   double value);
-    void AddResults(const string &test,
-                    const string &atts,
-                    const string &unit,
-                    const vector<double> &values);
-    vector<Result>        GetResultsForTest(const string &test);
-    const vector<Result> &GetResults() const;
+   public:
+    void AddResult(const string& test, const string& atts, const string& unit, double value);
+    void AddResults(const string& test, const string& atts, const string& unit,
+                    const vector<double>& values);
+    vector<Result> GetResultsForTest(const string& test);
+    const vector<Result>& GetResults() const;
    void ClearAllResults();
    void DumpDetailed(ostream&);
    void DumpSummary(ostream&);
    void DumpCsv(string fileName);

-  private:
+   private:
    bool IsFileEmpty(string fileName);
-
 };


@@ -1,6 +1,6 @@
 #include "hip/hip_runtime.h"

-extern "C" __global__ void NullKernel(hipLaunchParm lp, float* Ad){
+extern "C" __global__ void NullKernel(hipLaunchParm lp, float* Ad) {
    if (Ad) {
        Ad[0] = 42;
    }
@@ -1,20 +1,17 @@
 #include <hip/hip_runtime.h>

-static const int BLOCKSIZEX=32;
-static const int BLOCKSIZEY=16;
+static const int BLOCKSIZEX = 32;
+static const int BLOCKSIZEY = 16;

-__global__ void fails(hipLaunchParm lp, float* pErrorI)
-{
-    if(pErrorI!=0)
-    {
-        pErrorI[0]=1;
+__global__ void fails(hipLaunchParm lp, float* pErrorI) {
+    if (pErrorI != 0) {
+        pErrorI[0] = 1;
    }
 }

-int main()
-{
-    dim3 blocks(1,1);
-    dim3 threads(BLOCKSIZEX,BLOCKSIZEY);
+int main() {
+    dim3 blocks(1, 1);
+    dim3 threads(BLOCKSIZEX, BLOCKSIZEY);
    float error;

    hipLaunchKernel(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error);
@@ -11,96 +11,72 @@ using namespace std;
 #define SORT_RETAIN_ATTS_ORDER 1


-bool ResultDatabase::Result::operator<(const Result &rhs) const
-{
-    if (test < rhs.test)
-        return true;
-    if (test > rhs.test)
-        return false;
-#if (SORT_RETAIN_ATTS_ORDER == 0) 
+bool ResultDatabase::Result::operator<(const Result& rhs) const {
+    if (test < rhs.test) return true;
+    if (test > rhs.test) return false;
+#if (SORT_RETAIN_ATTS_ORDER == 0)
    // For ties, sort by the value of the attribute:
-    if (atts < rhs.atts)
-        return true;
-    if (atts > rhs.atts)
-        return false;
+    if (atts < rhs.atts) return true;
+    if (atts > rhs.atts) return false;
 #endif
-    return false; // less-operator returns false on equal
+    return false;  // less-operator returns false on equal
 }

-double ResultDatabase::Result::GetMin() const
-{
+double ResultDatabase::Result::GetMin() const {
    double r = FLT_MAX;
-    for (int i=0; i<value.size(); i++)
-    {
+    for (int i = 0; i < value.size(); i++) {
        r = min(r, value[i]);
    }
    return r;
 }

-double ResultDatabase::Result::GetMax() const
-{
+double ResultDatabase::Result::GetMax() const {
    double r = -FLT_MAX;
-    for (int i=0; i<value.size(); i++)
-    {
+    for (int i = 0; i < value.size(); i++) {
        r = max(r, value[i]);
    }
    return r;
 }

-double ResultDatabase::Result::GetMedian() const
-{
-    return GetPercentile(50);
-}
+double ResultDatabase::Result::GetMedian() const { return GetPercentile(50); }

-double ResultDatabase::Result::GetPercentile(double q) const
-{
+double ResultDatabase::Result::GetPercentile(double q) const {
    int n = value.size();
-    if (n == 0)
-        return FLT_MAX;
-    if (n == 1)
-        return value[0];
+    if (n == 0) return FLT_MAX;
+    if (n == 1) return value[0];

-    if (q <= 0)
-        return value[0];
-    if (q >= 100)
-        return value[n-1];
+    if (q <= 0) return value[0];
+    if (q >= 100) return value[n - 1];

    double index = ((n + 1.) * q / 100.) - 1;

    vector<double> sorted = value;
    sort(sorted.begin(), sorted.end());

-    if (n == 2)
-        return (sorted[0] * (1 - q/100.)  +  sorted[1] * (q/100.));
+    if (n == 2) return (sorted[0] * (1 - q / 100.) + sorted[1] * (q / 100.));

    int index_lo = int(index);
    double frac = index - index_lo;
-    if (frac == 0)
-        return sorted[index_lo];
+    if (frac == 0) return sorted[index_lo];

    double lo = sorted[index_lo];
    double hi = sorted[index_lo + 1];
-    return lo + (hi-lo)*frac;
+    return lo + (hi - lo) * frac;
 }

-double ResultDatabase::Result::GetMean() const
-{
+double ResultDatabase::Result::GetMean() const {
    double r = 0;
-    for (int i=0; i<value.size(); i++)
-    {
+    for (int i = 0; i < value.size(); i++) {
        r += value[i];
    }
    return r / double(value.size());
 }

-double ResultDatabase::Result::GetStdDev() const
-{
+double ResultDatabase::Result::GetStdDev() const {
    double r = 0;
    double u = GetMean();
-    if (u == FLT_MAX)
-        return FLT_MAX;
-    for (int i=0; i<value.size(); i++)
-    {
+    if (u == FLT_MAX) return FLT_MAX;
+    for (int i = 0; i < value.size(); i++) {
        r += (value[i] - u) * (value[i] - u);
    }
    r = sqrt(r / value.size());
@@ -108,58 +84,42 @@ double ResultDatabase::Result::GetStdDev() const
 }


-void ResultDatabase::AddResults(const string &test,
-                                const string &atts,
-                                const string &unit,
-                                const vector<double> &values)
-{
-    for (int i=0; i<values.size(); i++)
-    {
+void ResultDatabase::AddResults(const string& test, const string& atts, const string& unit,
+                                const vector<double>& values) {
+    for (int i = 0; i < values.size(); i++) {
        AddResult(test, atts, unit, values[i]);
    }
 }

-static string RemoveAllButLeadingSpaces(const string &a)
-{
+static string RemoveAllButLeadingSpaces(const string& a) {
    string b;
    int n = a.length();
    int i = 0;
-    while (i<n && a[i] == ' ')
-    {
+    while (i < n && a[i] == ' ') {
        b += a[i];
        ++i;
    }
-    for (; i<n; i++)
-    {
-        if (a[i] != ' ' && a[i] != '\t')
-            b += a[i];
+    for (; i < n; i++) {
+        if (a[i] != ' ' && a[i] != '\t') b += a[i];
    }
    return b;
 }

-void ResultDatabase::AddResult(const string &test_orig,
-                               const string &atts_orig,
-                               const string &unit_orig,
-                               double value)
-{
+void ResultDatabase::AddResult(const string& test_orig, const string& atts_orig,
+                               const string& unit_orig, double value) {
    string test = RemoveAllButLeadingSpaces(test_orig);
    string atts = RemoveAllButLeadingSpaces(atts_orig);
    string unit = RemoveAllButLeadingSpaces(unit_orig);
    int index;
-    for (index = 0; index < results.size(); index++)
-    {
-        if (results[index].test == test &&
-            results[index].atts == atts)
-        {
-            if (results[index].unit != unit)
-                throw "Internal error: mixed units";
+    for (index = 0; index < results.size(); index++) {
+        if (results[index].test == test && results[index].atts == atts) {
+            if (results[index].unit != unit) throw "Internal error: mixed units";

            break;
        }
    }

-    if (index >= results.size())
-    {
+    if (index >= results.size()) {
        Result r;
        r.test = test;
        r.atts = atts;
@@ -193,43 +153,35 @@ void ResultDatabase::AddResult(const string &test_orig,
 //    Changed note about missing values to be worded a little better.
 //
 // ****************************************************************************
-void ResultDatabase::DumpDetailed(ostream &out)
-{
+void ResultDatabase::DumpDetailed(ostream& out) {
    vector<Result> sorted(results);

 #if SORT_BY_NAME
    stable_sort(sorted.begin(), sorted.end());
 #endif

-    const int testNameW = 24 ;
+    const int testNameW = 24;
    const int attW = 12;
    const int fieldW = 11;
    out << std::fixed << right << std::setprecision(4);

    int maxtrials = 1;
-    for (int i=0; i<sorted.size(); i++)
-    {
-        if (sorted[i].value.size() > maxtrials)
-            maxtrials = sorted[i].value.size();
+    for (int i = 0; i < sorted.size(); i++) {
+        if (sorted[i].value.size() > maxtrials) maxtrials = sorted[i].value.size();
    }

    // TODO: in big parallel runs, the "trials" are the procs
    // and we really don't want to print them all out....
-    out << setw(testNameW) << "test\t"  
-        << setw(attW) << "atts\t"
-        << setw(fieldW) 
-        << "median\t"
+    out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "median\t"
        << "mean\t"
        << "stddev\t"
        << "min\t"
        << "max\t";
-    for (int i=0; i<maxtrials; i++)
-        out << "trial"<<i<<"\t";
+    for (int i = 0; i < maxtrials; i++) out << "trial" << i << "\t";
    out << endl;

-    for (int i=0; i<sorted.size(); i++)
-    {
-        Result &r = sorted[i];
+    for (int i = 0; i < sorted.size(); i++) {
+        Result& r = sorted[i];
        out << setw(testNameW) << r.test + "\t";
        out << setw(attW) << r.atts + "\t";
        out << setw(fieldW) << r.unit + "\t";
@@ -240,7 +192,7 @@ void ResultDatabase::DumpDetailed(ostream &out)
        if (r.GetMean() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMean()   << "\t";
+            out << r.GetMean() << "\t";
        if (r.GetStdDev() == FLT_MAX)
            out << "N/A\t";
        else
@@ -248,13 +200,12 @@ void ResultDatabase::DumpDetailed(ostream &out)
        if (r.GetMin() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMin()    << "\t";
+            out << r.GetMin() << "\t";
        if (r.GetMax() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMax()    << "\t";
-        for (int j=0; j<r.value.size(); j++)
-        {
+            out << r.GetMax() << "\t";
+        for (int j = 0; j < r.value.size(); j++) {
            if (r.value[j] == FLT_MAX)
                out << "N/A\t";
            else
@@ -290,25 +241,21 @@ void ResultDatabase::DumpDetailed(ostream &out)
 //    Added note about (*) missing value tag.
 //
 // ****************************************************************************
-void ResultDatabase::DumpSummary(ostream &out)
-{
+void ResultDatabase::DumpSummary(ostream& out) {
    vector<Result> sorted(results);

 #if SORT_BY_NAME
    stable_sort(sorted.begin(), sorted.end());
 #endif

-    const int testNameW = 32 ;
+    const int testNameW = 32;
    const int attW = 12;
    const int fieldW = 9;
    out << std::fixed << right << std::setprecision(2);

    // TODO: in big parallel runs, the "trials" are the procs
    // and we really don't want to print them all out....
-    out << setw(testNameW) << "test\t"  
-        << setw(attW) << "atts\t"
-        << setw(fieldW) 
-        << "units\t"
+    out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "units\t"
        << "median\t"
        << "mean\t"
        << "stddev\t"
@@ -316,9 +263,8 @@ void ResultDatabase::DumpSummary(ostream &out)
        << "max\t";
    out << endl;

-    for (int i=0; i<sorted.size(); i++)
-    {
-        Result &r = sorted[i];
+    for (int i = 0; i < sorted.size(); i++) {
+        Result& r = sorted[i];
        out << setw(testNameW) << r.test + "\t";
        out << setw(attW) << r.atts + "\t";
        out << setw(fieldW) << r.unit + "\t";
@@ -329,7 +275,7 @@ void ResultDatabase::DumpSummary(ostream &out)
        if (r.GetMean() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMean()   << "\t";
+            out << r.GetMean() << "\t";
        if (r.GetStdDev() == FLT_MAX)
            out << "N/A\t";
        else
@@ -337,11 +283,11 @@ void ResultDatabase::DumpSummary(ostream &out)
        if (r.GetMin() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMin()    << "\t";
+            out << r.GetMin() << "\t";
        if (r.GetMax() == FLT_MAX)
            out << "N/A\t";
        else
-            out << r.GetMax()    << "\t";
+            out << r.GetMax() << "\t";

        out << endl;
    }
@@ -368,10 +314,7 @@ void ResultDatabase::DumpSummary(ostream &out)
 //
 //
 // ****************************************************************************
-void ResultDatabase::ClearAllResults()
-{
-	results.clear();	
-}
+void ResultDatabase::ClearAllResults() { results.clear(); }

 // ****************************************************************************
 //  Method:  ResultDatabase::DumpCsv
@@ -389,8 +332,7 @@ void ResultDatabase::ClearAllResults()
 //  Modifications:
 //
 // ****************************************************************************
-void ResultDatabase::DumpCsv(string fileName)
-{
+void ResultDatabase::DumpCsv(string fileName) {
    bool emptyFile;
    vector<Result> sorted(results);

@@ -398,32 +340,30 @@ void ResultDatabase::DumpCsv(string fileName)
    stable_sort(sorted.begin(), sorted.end());
 #endif

-    //Check to see if the file is empty - if so, add the headers
+    // Check to see if the file is empty - if so, add the headers
    emptyFile = this->IsFileEmpty(fileName);

-    //Open file and append by default
+    // Open file and append by default
    ofstream out;
-    out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app); 
+    out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app);

-    //Add headers only for empty files
-    if(emptyFile)
-    {
-    // TODO: in big parallel runs, the "trials" are the procs
-    // and we really don't want to print them all out....
-    out << "test, "
-        << "atts, "
-        << "units, "
-        << "median, "
-        << "mean, "
-        << "stddev, "
-        << "min, "
-        << "max, ";
-    out << endl;
+    // Add headers only for empty files
+    if (emptyFile) {
+        // TODO: in big parallel runs, the "trials" are the procs
+        // and we really don't want to print them all out....
+        out << "test, "
+            << "atts, "
+            << "units, "
+            << "median, "
+            << "mean, "
+            << "stddev, "
+            << "min, "
+            << "max, ";
+        out << endl;
    }

-    for (int i=0; i<sorted.size(); i++)
-    {
-        Result &r = sorted[i];
+    for (int i = 0; i < sorted.size(); i++) {
+        Result& r = sorted[i];
        out << r.test << ", ";
        out << r.atts << ", ";
        out << r.unit << ", ";
@@ -434,7 +374,7 @@ void ResultDatabase::DumpCsv(string fileName)
        if (r.GetMean() == FLT_MAX)
            out << "N/A, ";
        else
-            out << r.GetMean()   << ", ";
+            out << r.GetMean() << ", ";
        if (r.GetStdDev() == FLT_MAX)
            out << "N/A, ";
        else
@@ -442,11 +382,11 @@ void ResultDatabase::DumpCsv(string fileName)
        if (r.GetMin() == FLT_MAX)
            out << "N/A, ";
        else
-            out << r.GetMin()    << ", ";
+            out << r.GetMin() << ", ";
        if (r.GetMax() == FLT_MAX)
            out << "N/A, ";
        else
-            out << r.GetMax()    << ", ";
+            out << r.GetMax() << ", ";

        out << endl;
    }
@@ -471,29 +411,24 @@ void ResultDatabase::DumpCsv(string fileName)
 //
 // ****************************************************************************

-bool ResultDatabase::IsFileEmpty(string fileName)
-{
-      bool fileEmpty;
+bool ResultDatabase::IsFileEmpty(string fileName) {
+    bool fileEmpty;

-      ifstream file(fileName.c_str());
+    ifstream file(fileName.c_str());

-      //If the file doesn't exist it is by definition empty
-      if(!file.good())
-      {
+    // If the file doesn't exist it is by definition empty
+    if (!file.good()) {
        return true;
-      }
-      else
-      {
+    } else {
        fileEmpty = (bool)(file.peek() == ifstream::traits_type::eof());
        file.close();
-        
-	return fileEmpty;
-      }
-  
-      //Otherwise, return false  
-        return false;
-}

+        return fileEmpty;
+    }
+
+    // Otherwise, return false
+    return false;
+}


 // ****************************************************************************
@@ -511,16 +446,12 @@ bool ResultDatabase::IsFileEmpty(string fileName)
 //  Modifications:
 //
 // ****************************************************************************
-vector<ResultDatabase::Result>
-ResultDatabase::GetResultsForTest(const string &test)
-{
+vector<ResultDatabase::Result> ResultDatabase::GetResultsForTest(const string& test) {
    // get only the given test results
    vector<Result> retval;
-    for (int i=0; i<results.size(); i++)
-    {
-        Result &r = results[i];
-        if (r.test == test)
-            retval.push_back(r);
+    for (int i = 0; i < results.size(); i++) {
+        Result& r = results[i];
+        if (r.test == test) retval.push_back(r);
    }
    return retval;
 }
@@ -539,8 +470,4 @@ ResultDatabase::GetResultsForTest(const string &test)
 //  Modifications:
 //
 // ****************************************************************************
-const vector<ResultDatabase::Result> &
-ResultDatabase::GetResults() const
-{
-    return results;
-}
+const vector<ResultDatabase::Result>& ResultDatabase::GetResults() const { return results; }
@@ -6,11 +6,11 @@
 #include <iostream>
 #include <fstream>
 #include <cfloat>
+using std::ifstream;
+using std::ofstream;
+using std::ostream;
 using std::string;
 using std::vector;
-using std::ostream;
-using std::ofstream;
-using std::ifstream;


 // ****************************************************************************
@@ -40,18 +40,16 @@ using std::ifstream;
 //    Added a GetResults method as well, and made several functions const.
 //
 // ****************************************************************************
-class ResultDatabase
-{
-  public:
+class ResultDatabase {
+   public:
    //
    // A performance result for a single SHOC benchmark run.
    //
-    struct Result
-    {
-        string test;  // e.g. "readback"
-        string atts;  // e.g. "pagelocked 4k^2"
-        string unit;  // e.g. "MB/sec"
-        vector<double> value; // e.g. "837.14"
+    struct Result {
+        string test;           // e.g. "readback"
+        string atts;           // e.g. "pagelocked 4k^2"
+        string unit;           // e.g. "MB/sec"
+        vector<double> value;  // e.g. "837.14"
        double GetMin() const;
        double GetMax() const;
        double GetMedian() const;
@@ -59,41 +57,32 @@ class ResultDatabase
        double GetMean() const;
        double GetStdDev() const;

-        bool operator<(const Result &rhs) const;
+        bool operator<(const Result& rhs) const;

-        bool HadAnyFLTMAXValues() const
-        {
-            for (int i=0; i<value.size(); ++i)
-            {
-                if (value[i] >= FLT_MAX)
-                    return true;
+        bool HadAnyFLTMAXValues() const {
+            for (int i = 0; i < value.size(); ++i) {
+                if (value[i] >= FLT_MAX) return true;
            }
            return false;
        }
    };

-  protected:
+   protected:
    vector<Result> results;

-  public:
-    void AddResult(const string &test,
-                   const string &atts,
-                   const string &unit,
-                   double value);
-    void AddResults(const string &test,
-                    const string &atts,
-                    const string &unit,
-                    const vector<double> &values);
-    vector<Result>        GetResultsForTest(const string &test);
-    const vector<Result> &GetResults() const;
+   public:
+    void AddResult(const string& test, const string& atts, const string& unit, double value);
+    void AddResults(const string& test, const string& atts, const string& unit,
+                    const vector<double>& values);
+    vector<Result> GetResultsForTest(const string& test);
+    const vector<Result>& GetResults() const;
    void ClearAllResults();
    void DumpDetailed(ostream&);
    void DumpSummary(ostream&);
    void DumpCsv(string fileName);

-  private:
+   private:
    bool IsFileEmpty(string fileName);
-
 };


@@ -21,35 +21,34 @@ THE SOFTWARE.
 */

 #include "hip/hip_runtime.h"
-#include<iostream>
-#include<time.h>
-#include"ResultDatabase.h"
+#include <iostream>
+#include <time.h>
+#include "ResultDatabase.h"

 #define PRINT_PROGRESS 0

-#define check(cmd) \
-{\
-  hipError_t status = cmd;\
-  if(status != hipSuccess){ \
-    printf("error: '%s'(%d) from %s at %s:%d\n", \
-          hipGetErrorString(status), status, #cmd,\
-          __FILE__, __LINE__); \
-	abort(); \
-  }\
-}
+#define check(cmd)                                                                                 \
+    {                                                                                              \
+        hipError_t status = cmd;                                                                   \
+        if (status != hipSuccess) {                                                                \
+            printf("error: '%s'(%d) from %s at %s:%d\n", hipGetErrorString(status), status, #cmd,  \
+                   __FILE__, __LINE__);                                                            \
+            abort();                                                                               \
+        }                                                                                          \
+    }

-#define LEN 1024*1024
+#define LEN 1024 * 1024

 #define NUM_GROUPS 1
 #define GROUP_SIZE 64
-#define TEST_ITERS 20          
+#define TEST_ITERS 20
 #define DISPATCHES_PER_TEST 100

 const unsigned p_tests = 0xfffffff;


 // HCC optimizes away fully NULL kernel calls, so run one that is nearly null:
-__global__ void NearlyNull(hipLaunchParm lp, float* Ad){
+__global__ void NearlyNull(hipLaunchParm lp, float* Ad) {
    if (Ad) {
        Ad[0] = 42;
    }
@@ -59,38 +58,35 @@ __global__ void NearlyNull(hipLaunchParm lp, float* Ad){
 ResultDatabase resultDB;


-void stopTest(hipEvent_t start, hipEvent_t stop, const char *msg, int iters)
-{
-	float mS = 0;
+void stopTest(hipEvent_t start, hipEvent_t stop, const char* msg, int iters) {
+    float mS = 0;
    check(hipEventRecord(stop));
    check(hipDeviceSynchronize());
    check(hipEventElapsedTime(&mS, start, stop));
-    resultDB.AddResult(std::string(msg), "", "uS", mS*1000/iters); 
-    if (PRINT_PROGRESS & 0x1 ) {
-        std::cout<< msg <<"\t\t"<<mS*1000/iters<<" uS"<<std::endl;
+    resultDB.AddResult(std::string(msg), "", "uS", mS * 1000 / iters);
+    if (PRINT_PROGRESS & 0x1) {
+        std::cout << msg << "\t\t" << mS * 1000 / iters << " uS" << std::endl;
    }
-    if (PRINT_PROGRESS & 0x2 ) {
+    if (PRINT_PROGRESS & 0x2) {
        resultDB.DumpSummary(std::cout);
    }
 }


-int main(){
-
-	hipError_t err;
-	float *Ad;
+int main() {
+    hipError_t err;
+    float* Ad;
    check(hipMalloc(&Ad, 4));


-	hipStream_t stream;
-	check(hipStreamCreate(&stream));
+    hipStream_t stream;
+    check(hipStreamCreate(&stream));


-	hipEvent_t start, sync, stop;
-	check(hipEventCreate(&start));
-	check(hipEventCreateWithFlags(&sync, hipEventBlockingSync));
-	check(hipEventCreate(&stop));
-
+    hipEvent_t start, sync, stop;
+    check(hipEventCreate(&start));
+    check(hipEventCreateWithFlags(&sync, hipEventBlockingSync));
+    check(hipEventCreate(&stop));


    hipStream_t stream0 = 0;
@@ -103,7 +99,6 @@ int main(){
    }


-
    if (p_tests & 0x2) {
        hipEventRecord(start);
        hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
@@ -112,9 +107,9 @@ int main(){


    if (p_tests & 0x4) {
-        for (int t=0; t<TEST_ITERS; t++)  {
+        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
-            for(int i=0;i<DISPATCHES_PER_TEST;i++){
+            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
                hipEventRecord(sync);
                hipEventSynchronize(sync);
@@ -125,9 +120,9 @@ int main(){


    if (p_tests & 0x10) {
-        for (int t=0; t<TEST_ITERS; t++)  {
+        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
-            for(int i=0;i<DISPATCHES_PER_TEST;i++){
+            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
                hipEventRecord(sync);
                hipEventSynchronize(sync);
@@ -139,9 +134,9 @@ int main(){
 #if 1

    if (p_tests & 0x40) {
-        for (int t=0; t<TEST_ITERS; t++)  {
+        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
-            for(int i=0;i<DISPATCHES_PER_TEST;i++){
+            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
            }
            stopTest(start, stop, "NullStreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
@@ -149,9 +144,9 @@ int main(){
    }

    if (p_tests & 0x80) {
-        for (int t=0; t<TEST_ITERS; t++)  {
+        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
-            for(int i=0;i<DISPATCHES_PER_TEST;i++){
+            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
            }
            stopTest(start, stop, "StreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
@@ -161,7 +156,7 @@ int main(){
    resultDB.DumpSummary(std::cout);


-	check(hipEventDestroy(start));
-	check(hipEventDestroy(sync));
-	check(hipEventDestroy(stop));
+    check(hipEventDestroy(start));
+    check(hipEventDestroy(sync));
+    check(hipEventDestroy(stop));
 }
@@ -24,61 +24,57 @@ THE SOFTWARE.
 #include <iomanip>
 #include "hip/hip_runtime.h"

-#define KNRM  "\x1B[0m"
-#define KRED  "\x1B[31m"
-#define KGRN  "\x1B[32m"
-#define KYEL  "\x1B[33m"
-#define KBLU  "\x1B[34m"
-#define KMAG  "\x1B[35m"
-#define KCYN  "\x1B[36m"
-#define KWHT  "\x1B[37m"
+#define KNRM "\x1B[0m"
+#define KRED "\x1B[31m"
+#define KGRN "\x1B[32m"
+#define KYEL "\x1B[33m"
+#define KBLU "\x1B[34m"
+#define KMAG "\x1B[35m"
+#define KCYN "\x1B[36m"
+#define KWHT "\x1B[37m"

-#define failed(...) \
-    printf ("%serror: ", KRED);\
-    printf (__VA_ARGS__);\
-    printf ("\n");\
-    printf ("error: TEST FAILED\n%s", KNRM );\
+#define failed(...)                                                                                \
+    printf("%serror: ", KRED);                                                                     \
+    printf(__VA_ARGS__);                                                                           \
+    printf("\n");                                                                                  \
+    printf("error: TEST FAILED\n%s", KNRM);                                                        \
    exit(EXIT_FAILURE);

-#define HIPCHECK(error) \
-    if (error != hipSuccess) { \
-        printf("%serror: '%s'(%d) at %s:%d%s\n", \
-               KRED, hipGetErrorString(error), error,\
-               __FILE__, __LINE__,KNRM);\
-        failed("API returned error code.");\
+#define HIPCHECK(error)                                                                            \
+    if (error != hipSuccess) {                                                                     \
+        printf("%serror: '%s'(%d) at %s:%d%s\n", KRED, hipGetErrorString(error), error, __FILE__,  \
+               __LINE__, KNRM);                                                                    \
+        failed("API returned error code.");                                                        \
    }

-void printCompilerInfo ()
-{
+void printCompilerInfo() {
 #ifdef __HCC__
-    printf ("compiler: hcc version=%s, workweek (YYWWD) = %u\n", __hcc_version__, __hcc_workweek__);
+    printf("compiler: hcc version=%s, workweek (YYWWD) = %u\n", __hcc_version__, __hcc_workweek__);
 #endif
 #ifdef __NVCC__
-    printf ("compiler: nvcc\n");
+    printf("compiler: nvcc\n");
 #endif
 }

-double bytesToGB(size_t s)
-{
-    return (double)s / (1024.0*1024.0*1024.0);
-}
+double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); }

-#define printLimit(w1, limit, units) \
-{\
-    size_t val;\
-    cudaDeviceGetLimit(&val, limit);\
-    std::cout << setw(w1) << #limit": " << val << " " << units << std::endl;\
-}
+#define printLimit(w1, limit, units)                                                               \
+    {                                                                                              \
+        size_t val;                                                                                \
+        cudaDeviceGetLimit(&val, limit);                                                           \
+        std::cout << setw(w1) << #limit ": " << val << " " << units << std::endl;                  \
+    }


-void printDeviceProp (int deviceId)
-{
+void printDeviceProp(int deviceId) {
    using namespace std;
    const int w1 = 34;

    cout << left;

-    cout << setw(w1) << "--------------------------------------------------------------------------------" << endl;
+    cout << setw(w1)
+         << "--------------------------------------------------------------------------------"
+         << endl;
    cout << setw(w1) << "device#" << deviceId << endl;

    hipDeviceProp_t props;
@@ -88,16 +84,22 @@ void printDeviceProp (int deviceId)
    cout << setw(w1) << "pciBusID: " << props.pciBusID << endl;
    cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl;
    cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl;
-    cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl;
+    cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor
+         << endl;
    cout << setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << endl;
    cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl;
-    cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" << endl;
+    cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz"
+         << endl;
    cout << setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << endl;
-    cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0 << " Mhz" << endl;
-    cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl;
-    cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl;
+    cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0
+         << " Mhz" << endl;
+    cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2)
+         << bytesToGB(props.totalGlobalMem) << " GB" << endl;
+    cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2)
+         << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl;
    cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl;
-    cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl;
+    cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB"
+         << endl;
    cout << setw(w1) << "regsPerBlock: " << props.regsPerBlock << endl;
    cout << setw(w1) << "warpSize: " << props.warpSize << endl;
    cout << setw(w1) << "l2CacheSize: " << props.l2CacheSize << endl;
@@ -112,29 +114,31 @@ void printDeviceProp (int deviceId)
    cout << setw(w1) << "major: " << props.major << endl;
    cout << setw(w1) << "minor: " << props.minor << endl;
    cout << setw(w1) << "concurrentKernels: " << props.concurrentKernels << endl;
-    cout << setw(w1) << "arch.hasGlobalInt32Atomics: " <<     props.arch.hasGlobalInt32Atomics    << endl;
-    cout << setw(w1) << "arch.hasGlobalFloatAtomicExch: " <<  props.arch.hasGlobalFloatAtomicExch << endl;
-    cout << setw(w1) << "arch.hasSharedInt32Atomics: " <<     props.arch.hasSharedInt32Atomics    << endl;
-    cout << setw(w1) << "arch.hasSharedFloatAtomicExch: " <<  props.arch.hasSharedFloatAtomicExch << endl;
-    cout << setw(w1) << "arch.hasFloatAtomicAdd: " <<         props.arch.hasFloatAtomicAdd        << endl;
-    cout << setw(w1) << "arch.hasGlobalInt64Atomics: " <<     props.arch.hasGlobalInt64Atomics    << endl;
-    cout << setw(w1) << "arch.hasSharedInt64Atomics: " <<     props.arch.hasSharedInt64Atomics    << endl;
-    cout << setw(w1) << "arch.hasDoubles: " <<                props.arch.hasDoubles               << endl;
-    cout << setw(w1) << "arch.hasWarpVote: " <<               props.arch.hasWarpVote              << endl;
-    cout << setw(w1) << "arch.hasWarpBallot: " <<             props.arch.hasWarpBallot            << endl;
-    cout << setw(w1) << "arch.hasWarpShuffle: " <<            props.arch.hasWarpShuffle           << endl;
-    cout << setw(w1) << "arch.hasFunnelShift: " <<            props.arch.hasFunnelShift           << endl;
-    cout << setw(w1) << "arch.hasThreadFenceSystem: " <<      props.arch.hasThreadFenceSystem     << endl;
-    cout << setw(w1) << "arch.hasSyncThreadsExt: " <<         props.arch.hasSyncThreadsExt        << endl;
-    cout << setw(w1) << "arch.hasSurfaceFuncs: " <<           props.arch.hasSurfaceFuncs          << endl;
-    cout << setw(w1) << "arch.has3dGrid: " <<                 props.arch.has3dGrid                << endl;
-    cout << setw(w1) << "arch.hasDynamicParallelism: " <<     props.arch.hasDynamicParallelism    << endl;
-    cout << setw(w1) << "gcnArch: " <<     props.gcnArch    << endl;
+    cout << setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << endl;
+    cout << setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch
+         << endl;
+    cout << setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << endl;
+    cout << setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch
+         << endl;
+    cout << setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << endl;
+    cout << setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << endl;
+    cout << setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << endl;
+    cout << setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << endl;
+    cout << setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << endl;
+    cout << setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << endl;
+    cout << setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << endl;
+    cout << setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << endl;
+    cout << setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << endl;
+    cout << setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << endl;
+    cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl;
+    cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl;
+    cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl;
+    cout << setw(w1) << "gcnArch: " << props.gcnArch << endl;

    int deviceCnt;
    hipGetDeviceCount(&deviceCnt);
    cout << setw(w1) << "peers: ";
-    for (int i=0; i<deviceCnt; i++) {
+    for (int i = 0; i < deviceCnt; i++) {
        int isPeer;
        hipDeviceCanAccessPeer(&isPeer, i, deviceId);
        if (isPeer) {
@@ -143,7 +147,7 @@ void printDeviceProp (int deviceId)
    }
    cout << endl;
    cout << setw(w1) << "non-peers: ";
-    for (int i=0; i<deviceCnt; i++) {
+    for (int i = 0; i < deviceCnt; i++) {
        int isPeer;
        hipDeviceCanAccessPeer(&isPeer, i, deviceId);
        if (!isPeer) {
@@ -164,8 +168,6 @@ void printDeviceProp (int deviceId)
 #endif


-
-
    cout << endl;


@@ -174,11 +176,11 @@ void printDeviceProp (int deviceId)

    cout << fixed << setprecision(2);
    cout << setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << endl;
-    cout << setw(w1) << "memInfo.free:  " << bytesToGB(free) << " GB (" << setprecision(0) << (float)free/total * 100.0 << "%)" << endl;
+    cout << setw(w1) << "memInfo.free:  " << bytesToGB(free) << " GB (" << setprecision(0)
+         << (float)free / total * 100.0 << "%)" << endl;
 }

-int main(int argc, char *argv[])
-{
+int main(int argc, char* argv[]) {
    using namespace std;

    cout << endl;
@@ -189,7 +191,7 @@ int main(int argc, char *argv[])

    HIPCHECK(hipGetDeviceCount(&deviceCnt));

-    for (int i=0; i< deviceCnt; i++) {
+    for (int i = 0; i < deviceCnt; i++) {
        printDeviceProp(i);
    }

@@ -20,28 +20,24 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include<iostream>
+#include <iostream>

 // hip header file
 #include "hip/hip_runtime.h"


-#define WIDTH     1024
+#define WIDTH 1024


-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
+__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -49,88 +45,79 @@ __global__ void matrixTranspose(hipLaunchParm lp,
 }

 // CPU implementation of matrix transpose
-void matrixTransposeCPUReference(
-    float * output,
-    float * input,
-    const unsigned int width)
-{
-    for(unsigned int j=0; j < width; j++)
-    {
-        for(unsigned int i=0; i < width; i++)
-        {
-            output[i*width + j] = input[j*width + i];
+void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
+    for (unsigned int j = 0; j < width; j++) {
+        for (unsigned int i = 0; i < width; i++) {
+            output[i * width + j] = input[j * width + i];
        }
    }
 }

 int main() {
+    float* Matrix;
+    float* TransposeMatrix;
+    float* cpuTransposeMatrix;

-  float* Matrix;
-  float* TransposeMatrix;
-  float* cpuTransposeMatrix;
+    float* gpuMatrix;
+    float* gpuTransposeMatrix;

-  float* gpuMatrix;
-  float* gpuTransposeMatrix;
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);

-  hipDeviceProp_t devProp;
-  hipGetDeviceProperties(&devProp, 0);
+    std::cout << "Device name " << devProp.name << std::endl;

-  std::cout << "Device name " << devProp.name << std::endl;
+    int i;
+    int errors;

-  int i;
-  int errors;
+    Matrix = (float*)malloc(NUM * sizeof(float));
+    TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+    cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
-  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
-
-  // initialize the input data
-  for (i = 0; i < NUM; i++) {
-    Matrix[i] = (float)i*10.0f;
-  }
-
-  // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
-
-  // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  errors = 0;
-  double eps = 1.0E-6;
-  for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-      errors++;
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+        Matrix[i] = (float)i * 10.0f;
    }
-  }
-  if (errors!=0) {
-    printf("FAILED: %d errors\n",errors);
-  } else {
-    printf ("PASSED!\n");
-  }

-  //free the resources on device side
-  hipFree(gpuMatrix);
-  hipFree(gpuTransposeMatrix);
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+    hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));

-  //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
+    // Memory transfer from host to device
+    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

-  return errors;
+    // Lauching kernel from host
+    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
+                    gpuMatrix, WIDTH);
+
+    // Memory transfer from device to host
+    hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU MatrixTranspose computation
+    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-6;
+    for (i = 0; i < NUM; i++) {
+        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+            errors++;
+        }
+    }
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
+    } else {
+        printf("PASSED!\n");
+    }
+
+    // free the resources on device side
+    hipFree(gpuMatrix);
+    hipFree(gpuTransposeMatrix);
+
+    // free the resources on host side
+    free(Matrix);
+    free(TransposeMatrix);
+    free(cpuTransposeMatrix);
+
+    return errors;
 }
@@ -20,155 +20,141 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include<iostream>
+#include <iostream>

 // hip header file
 #include "hip/hip_runtime.h"

-#define WIDTH     1024
+#define WIDTH 1024

-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
- 
+__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

-    asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x]));
+    asm volatile("v_mov_b32_e32 %0, %1" : "=v"(out[x * width + y]) : "v"(in[y * width + x]));
 }

 // CPU implementation of matrix transpose
-void matrixTransposeCPUReference(
-    float * output,
-    float * input,
-    const unsigned int width)
-{
-    for(unsigned int j=0; j < width; j++)
-    {
-        for(unsigned int i=0; i < width; i++)
-        {
-            output[i*width + j] = input[j*width + i];
+void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
+    for (unsigned int j = 0; j < width; j++) {
+        for (unsigned int i = 0; i < width; i++) {
+            output[i * width + j] = input[j * width + i];
        }
    }
 }

 int main() {
+    float* Matrix;
+    float* TransposeMatrix;
+    float* cpuTransposeMatrix;

-  float* Matrix;
-  float* TransposeMatrix;
-  float* cpuTransposeMatrix;
+    float* gpuMatrix;
+    float* gpuTransposeMatrix;

-  float* gpuMatrix;
-  float* gpuTransposeMatrix;
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);

-  hipDeviceProp_t devProp;
-  hipGetDeviceProperties(&devProp, 0);
+    std::cout << "Device name " << devProp.name << std::endl;

-  std::cout << "Device name " << devProp.name << std::endl;
+    hipEvent_t start, stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    float eventMs = 1.0f;

-  hipEvent_t start, stop;
-  hipEventCreate(&start);
-  hipEventCreate(&stop);
-  float eventMs = 1.0f;
+    int i;
+    int errors;

-  int i;
-  int errors;
+    Matrix = (float*)malloc(NUM * sizeof(float));
+    TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+    cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
-  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
-
-  // initialize the input data
-  for (i = 0; i < NUM; i++) {
-    Matrix[i] = (float)i*10.0f;
-  }
-
-  // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Record the start event
-  hipEventRecord(start, NULL);
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
-
-  // Record the stop event
-  hipEventRecord(stop, NULL);
-  hipEventSynchronize(stop);
-
-  hipEventElapsedTime(&eventMs, start, stop);
-
-  printf ("hipMemcpyHostToDevice time taken  = %6.3fms\n", eventMs);
-
-  // Record the start event
-  hipEventRecord(start, NULL);
-
-  // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Record the stop event
-  hipEventRecord(stop, NULL);
-  hipEventSynchronize(stop);
-
-  hipEventElapsedTime(&eventMs, start, stop);
-
-  printf ("kernel Execution time             = %6.3fms\n", eventMs);
-
-  // Record the start event
-  hipEventRecord(start, NULL);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
-
-  // Record the stop event
-  hipEventRecord(stop, NULL);
-  hipEventSynchronize(stop);
-
-  hipEventElapsedTime(&eventMs, start, stop);
-
-  printf ("hipMemcpyDeviceToHost time taken  = %6.3fms\n", eventMs);
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  errors = 0;
-  double eps = 1.0E-6;
-  for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]);
-      errors++;
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+        Matrix[i] = (float)i * 10.0f;
    }
-  }
-  if (errors!=0) {
-    printf("FAILED: %d errors\n",errors);
-  } else {
-    printf ("PASSED!\n");
-  }

-  //free the resources on device side
-  hipFree(gpuMatrix);
-  hipFree(gpuTransposeMatrix);
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+    hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));

-  //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
+    // Record the start event
+    hipEventRecord(start, NULL);

-  return errors;
+    // Memory transfer from host to device
+    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
+
+    // Record the stop event
+    hipEventRecord(stop, NULL);
+    hipEventSynchronize(stop);
+
+    hipEventElapsedTime(&eventMs, start, stop);
+
+    printf("hipMemcpyHostToDevice time taken  = %6.3fms\n", eventMs);
+
+    // Record the start event
+    hipEventRecord(start, NULL);
+
+    // Lauching kernel from host
+    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
+                    gpuMatrix, WIDTH);
+
+    // Record the stop event
+    hipEventRecord(stop, NULL);
+    hipEventSynchronize(stop);
+
+    hipEventElapsedTime(&eventMs, start, stop);
+
+    printf("kernel Execution time             = %6.3fms\n", eventMs);
+
+    // Record the start event
+    hipEventRecord(start, NULL);
+
+    // Memory transfer from device to host
+    hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
+
+    // Record the stop event
+    hipEventRecord(stop, NULL);
+    hipEventSynchronize(stop);
+
+    hipEventElapsedTime(&eventMs, start, stop);
+
+    printf("hipMemcpyDeviceToHost time taken  = %6.3fms\n", eventMs);
+
+    // CPU MatrixTranspose computation
+    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-6;
+    for (i = 0; i < NUM; i++) {
+        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+            printf("gpu%f cpu %f \n", TransposeMatrix[i], cpuTransposeMatrix[i]);
+            errors++;
+        }
+    }
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
+    } else {
+        printf("PASSED!\n");
+    }
+
+    // free the resources on device side
+    hipFree(gpuMatrix);
+    hipFree(gpuTransposeMatrix);
+
+    // free the resources on host side
+    free(Matrix);
+    free(TransposeMatrix);
+    free(cpuTransposeMatrix);
+
+    return errors;
 }
@@ -23,11 +23,8 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 extern texture<float, 2, hipReadModeElementType> tex;

-__global__ void tex2dKernel(hipLaunchParm lp, float* outputData,
-                             int width,
-                             int height)
-{
-    int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
-    int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
-    outputData[y*width + x] = tex2D(tex, x, y);
+__global__ void tex2dKernel(hipLaunchParm lp, float* outputData, int width, int height) {
+    int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    outputData[y * width + x] = tex2D(tex, x, y);
 }
@@ -32,111 +32,113 @@ THE SOFTWARE.
 texture<float, 2, hipReadModeElementType> tex;
 bool testResult = false;

-#define HIP_CHECK(cmd) \
-{\
-    hipError_t status = cmd;\
-    if(status != hipSuccess) {std::cout<<"error: #"<<status<<" ("<< hipGetErrorString(status) << ") at line:"<<__LINE__<<":  "<<#cmd<<std::endl;abort();}\
-}
+#define HIP_CHECK(cmd)                                                                             \
+    {                                                                                              \
+        hipError_t status = cmd;                                                                   \
+        if (status != hipSuccess) {                                                                \
+            std::cout << "error: #" << status << " (" << hipGetErrorString(status)                 \
+                      << ") at line:" << __LINE__ << ":  " << #cmd << std::endl;                   \
+            abort();                                                                               \
+        }                                                                                          \
+    }

-bool runTest(int argc, char **argv)
-{
+bool runTest(int argc, char** argv) {
    unsigned int width = 256;
    unsigned int height = 256;
    unsigned int size = width * height * sizeof(float);
-    float* hData = (float*) malloc(size);
+    float* hData = (float*)malloc(size);
    memset(hData, 0, size);
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
-            hData[i*width+j] = i*width+j;
+            hData[i * width + j] = i * width + j;
        }
    }
    hipModule_t Module;
    HIP_CHECK(hipModuleLoad(&Module, fileName));

    hipArray* array;
-	HIP_ARRAY_DESCRIPTOR desc;
-	desc.format = HIP_AD_FORMAT_FLOAT;
-	desc.numChannels = 1;
-	desc.width = width;
-	desc.height = height;
+    HIP_ARRAY_DESCRIPTOR desc;
+    desc.format = HIP_AD_FORMAT_FLOAT;
+    desc.numChannels = 1;
+    desc.width = width;
+    desc.height = height;
    hipArrayCreate(&array, &desc);

    hip_Memcpy2D copyParam;
-	memset(&copyParam, 0, sizeof(copyParam));
-	copyParam.dstMemoryType = hipMemoryTypeArray;
-	copyParam.dstArray = array;
-	copyParam.srcMemoryType = hipMemoryTypeHost;
-	copyParam.srcHost = hData;
-	copyParam.srcPitch = width * sizeof(float);
-	copyParam.widthInBytes = copyParam.srcPitch;
-	copyParam.height = height;
+    memset(&copyParam, 0, sizeof(copyParam));
+    copyParam.dstMemoryType = hipMemoryTypeArray;
+    copyParam.dstArray = array;
+    copyParam.srcMemoryType = hipMemoryTypeHost;
+    copyParam.srcHost = hData;
+    copyParam.srcPitch = width * sizeof(float);
+    copyParam.widthInBytes = copyParam.srcPitch;
+    copyParam.height = height;
    hipMemcpyParam2D(&copyParam);
-    
+
    textureReference* texref;
    hipModuleGetTexRef(&texref, Module, "tex");
    hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap);
-	hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap);
-	hipTexRefSetFilterMode(texref, hipFilterModePoint);
+    hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap);
+    hipTexRefSetFilterMode(texref, hipFilterModePoint);
    hipTexRefSetFlags(texref, 0);
-	hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1);
+    hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1);
    hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT);

    float* dData = NULL;
-    hipMalloc((void **) &dData, size);
+    hipMalloc((void**)&dData, size);

 #ifdef __HIP_PLATFORM_HCC__

-	struct {
-		uint32_t _hidden[6];  // genco path + wrapper-gen pass used hidden arguments.
-		void * _Ad;
-		unsigned int _Bd;
-		unsigned int _Cd;
-	} args;
+    struct {
+        uint32_t _hidden[6];  // genco path + wrapper-gen pass used hidden arguments.
+        void* _Ad;
+        unsigned int _Bd;
+        unsigned int _Cd;
+    } args;
    args._Ad = dData;
-	args._Bd = width;
-	args._Cd = height;
+    args._Bd = width;
+    args._Cd = height;

 #endif

 #ifdef __HIP_PLATFORM_NVCC__
-	struct {
-		uint32_t _hidden[1];
-		void * _Ad;
-		unsigned int _Bd;
-		unsigned int _Cd;
-	} args;
+    struct {
+        uint32_t _hidden[1];
+        void* _Ad;
+        unsigned int _Bd;
+        unsigned int _Cd;
+    } args;

-	args._hidden[0] = 0;
-	args._Ad = dData;
+    args._hidden[0] = 0;
+    args._Ad = dData;
    args._Bd = width;
-	args._Cd = height;
+    args._Cd = height;
 #endif


-	size_t sizeTemp = sizeof(args);
+    size_t sizeTemp = sizeof(args);

-	void *config[] = {
-	  HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
-	  HIP_LAUNCH_PARAM_BUFFER_SIZE, &sizeTemp,
-	  HIP_LAUNCH_PARAM_END
-	};
+    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE,
+                      &sizeTemp, HIP_LAUNCH_PARAM_END};

-	hipFunction_t Function;
-	HIP_CHECK(hipModuleGetFunction(&Function, Module, "tex2dKernel"));
+    hipFunction_t Function;
+    HIP_CHECK(hipModuleGetFunction(&Function, Module, "tex2dKernel"));

-    int temp1= width/16;
-    int temp2 = height/16;
-    HIP_CHECK(hipModuleLaunchKernel(Function, 16, 16, 1, temp1, temp2, 1, 0, 0, NULL, (void**)&config));
+    int temp1 = width / 16;
+    int temp2 = height / 16;
+    HIP_CHECK(
+        hipModuleLaunchKernel(Function, 16, 16, 1, temp1, temp2, 1, 0, 0, NULL, (void**)&config));
    hipDeviceSynchronize();

-    float *hOutputData = (float *) malloc(size);
-    memset(hOutputData, 0,  size);
+    float* hOutputData = (float*)malloc(size);
+    memset(hOutputData, 0, size);
    hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost);

    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
-            if (hData[i*width+j] != hOutputData[i*width+j]) {
-                printf("Difference [ %d %d ]:%f ----%f\n",i, j, hData[i*width+j] , hOutputData[i*width+j]);
+            if (hData[i * width + j] != hOutputData[i * width + j]) {
+                printf("Difference [ %d %d ]:%f ----%f\n", i, j, hData[i * width + j],
+                       hOutputData[i * width + j]);
                testResult = false;
                break;
            }
@@ -147,7 +149,7 @@ bool runTest(int argc, char **argv)
    return true;
 }

-int main(int argc, char **argv){
+int main(int argc, char** argv) {
    hipInit(0);
    testResult = runTest(argc, argv);
    printf("%s ...\n", testResult ? "PASSED" : "FAILED");
@@ -20,28 +20,24 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include<iostream>
+#include <iostream>

 // hip header file
 #include "hip/hip_runtime.h"


-#define WIDTH     1024
+#define WIDTH 1024


-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
+__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -49,88 +45,79 @@ __global__ void matrixTranspose(hipLaunchParm lp,
 }

 // CPU implementation of matrix transpose
-void matrixTransposeCPUReference(
-    float * output,
-    float * input,
-    const unsigned int width)
-{
-    for(unsigned int j=0; j < width; j++)
-    {
-        for(unsigned int i=0; i < width; i++)
-        {
-            output[i*width + j] = input[j*width + i];
+void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
+    for (unsigned int j = 0; j < width; j++) {
+        for (unsigned int i = 0; i < width; i++) {
+            output[i * width + j] = input[j * width + i];
        }
    }
 }

 int main() {
+    float* Matrix;
+    float* TransposeMatrix;
+    float* cpuTransposeMatrix;

-  float* Matrix;
-  float* TransposeMatrix;
-  float* cpuTransposeMatrix;
+    float* gpuMatrix;
+    float* gpuTransposeMatrix;

-  float* gpuMatrix;
-  float* gpuTransposeMatrix;
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);

-  hipDeviceProp_t devProp;
-  hipGetDeviceProperties(&devProp, 0);
+    std::cout << "Device name " << devProp.name << std::endl;

-  std::cout << "Device name " << devProp.name << std::endl;
+    int i;
+    int errors;

-  int i;
-  int errors;
+    Matrix = (float*)malloc(NUM * sizeof(float));
+    TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+    cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
-  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
-
-  // initialize the input data
-  for (i = 0; i < NUM; i++) {
-    Matrix[i] = (float)i*10.0f;
-  }
-
-  // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
-
-  // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  errors = 0;
-  double eps = 1.0E-6;
-  for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-      errors++;
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+        Matrix[i] = (float)i * 10.0f;
    }
-  }
-  if (errors!=0) {
-    printf("FAILED: %d errors\n",errors);
-  } else {
-    printf ("PASSED!\n");
-  }

-  //free the resources on device side
-  hipFree(gpuMatrix);
-  hipFree(gpuTransposeMatrix);
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+    hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));

-  //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
+    // Memory transfer from host to device
+    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

-  return errors;
+    // Launching kernel from host
+    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
+                    gpuMatrix, WIDTH);
+
+    // Memory transfer from device to host
+    hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU MatrixTranspose computation
+    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-6;
+    for (i = 0; i < NUM; i++) {
+        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+            errors++;
+        }
+    }
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
+    } else {
+        printf("PASSED!\n");
+    }
+
+    // free the resources on device side
+    hipFree(gpuMatrix);
+    hipFree(gpuTransposeMatrix);
+
+    // free the resources on host side
+    free(Matrix);
+    free(TransposeMatrix);
+    free(cpuTransposeMatrix);
+
+    return errors;
 }
@@ -20,26 +20,22 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include<iostream>
+#include <iostream>

 // hip header file
 #include "hip/hip_runtime.h"

-#define WIDTH     1024
+#define WIDTH 1024

-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
+__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -47,126 +43,117 @@ __global__ void matrixTranspose(hipLaunchParm lp,
 }

 // CPU implementation of matrix transpose
-void matrixTransposeCPUReference(
-    float * output,
-    float * input,
-    const unsigned int width)
-{
-    for(unsigned int j=0; j < width; j++)
-    {
-        for(unsigned int i=0; i < width; i++)
-        {
-            output[i*width + j] = input[j*width + i];
+void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
+    for (unsigned int j = 0; j < width; j++) {
+        for (unsigned int i = 0; i < width; i++) {
+            output[i * width + j] = input[j * width + i];
        }
    }
 }

 int main() {
+    float* Matrix;
+    float* TransposeMatrix;
+    float* cpuTransposeMatrix;

-  float* Matrix;
-  float* TransposeMatrix;
-  float* cpuTransposeMatrix;
+    float* gpuMatrix;
+    float* gpuTransposeMatrix;

-  float* gpuMatrix;
-  float* gpuTransposeMatrix;
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);

-  hipDeviceProp_t devProp;
-  hipGetDeviceProperties(&devProp, 0);
+    std::cout << "Device name " << devProp.name << std::endl;

-  std::cout << "Device name " << devProp.name << std::endl;
+    hipEvent_t start, stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    float eventMs = 1.0f;

-  hipEvent_t start, stop;
-  hipEventCreate(&start);
-  hipEventCreate(&stop);
-  float eventMs = 1.0f;
+    int i;
+    int errors;

-  int i;
-  int errors;
+    Matrix = (float*)malloc(NUM * sizeof(float));
+    TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+    cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
-  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
-
-  // initialize the input data
-  for (i = 0; i < NUM; i++) {
-    Matrix[i] = (float)i*10.0f;
-  }
-
-  // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Record the start event
-  hipEventRecord(start, NULL);
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
-
-  // Record the stop event
-  hipEventRecord(stop, NULL);
-  hipEventSynchronize(stop);
-
-  hipEventElapsedTime(&eventMs, start, stop);
-
-  printf ("hipMemcpyHostToDevice time taken  = %6.3fms\n", eventMs);
-
-  // Record the start event
-  hipEventRecord(start, NULL);
-
-  // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Record the stop event
-  hipEventRecord(stop, NULL);
-  hipEventSynchronize(stop);
-
-  hipEventElapsedTime(&eventMs, start, stop);
-
-  printf ("kernel Execution time             = %6.3fms\n", eventMs);
-
-  // Record the start event
-  hipEventRecord(start, NULL);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
-
-  // Record the stop event
-  hipEventRecord(stop, NULL);
-  hipEventSynchronize(stop);
-
-  hipEventElapsedTime(&eventMs, start, stop);
-
-  printf ("hipMemcpyDeviceToHost time taken  = %6.3fms\n", eventMs);
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  errors = 0;
-  double eps = 1.0E-6;
-  for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-      errors++;
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+        Matrix[i] = (float)i * 10.0f;
    }
-  }
-  if (errors!=0) {
-    printf("FAILED: %d errors\n",errors);
-  } else {
-    printf ("PASSED!\n");
-  }

-  //free the resources on device side
-  hipFree(gpuMatrix);
-  hipFree(gpuTransposeMatrix);
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+    hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));

-  //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
+    // Record the start event
+    hipEventRecord(start, NULL);

-  return errors;
+    // Memory transfer from host to device
+    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
+
+    // Record the stop event
+    hipEventRecord(stop, NULL);
+    hipEventSynchronize(stop);
+
+    hipEventElapsedTime(&eventMs, start, stop);
+
+    printf("hipMemcpyHostToDevice time taken  = %6.3fms\n", eventMs);
+
+    // Record the start event
+    hipEventRecord(start, NULL);
+
+    // Launching kernel from host
+    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
+                    gpuMatrix, WIDTH);
+
+    // Record the stop event
+    hipEventRecord(stop, NULL);
+    hipEventSynchronize(stop);
+
+    hipEventElapsedTime(&eventMs, start, stop);
+
+    printf("kernel Execution time             = %6.3fms\n", eventMs);
+
+    // Record the start event
+    hipEventRecord(start, NULL);
+
+    // Memory transfer from device to host
+    hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
+
+    // Record the stop event
+    hipEventRecord(stop, NULL);
+    hipEventSynchronize(stop);
+
+    hipEventElapsedTime(&eventMs, start, stop);
+
+    printf("hipMemcpyDeviceToHost time taken  = %6.3fms\n", eventMs);
+
+    // CPU MatrixTranspose computation
+    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-6;
+    for (i = 0; i < NUM; i++) {
+        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+            errors++;
+        }
+    }
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
+    } else {
+        printf("PASSED!\n");
+    }
+
+    // free the resources on device side
+    hipFree(gpuMatrix);
+    hipFree(gpuTransposeMatrix);
+
+    // free the resources on host side
+    free(Matrix);
+    free(TransposeMatrix);
+    free(cpuTransposeMatrix);
+
+    return errors;
 }
@@ -20,33 +20,29 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include<iostream>
+#include <iostream>

 // hip header file
 #include "hip/hip_runtime.h"
 #include "hip/hip_profile.h"

-#define WIDTH     1024
+#define WIDTH 1024

-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 #define ITERATIONS 10

 // Cmdline parms to control start and stop triggers
-int startTriggerIteration=-1;
-int stopTriggerIteration=-1;
+int startTriggerIteration = -1;
+int stopTriggerIteration = -1;

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
+__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -54,180 +50,171 @@ __global__ void matrixTranspose(hipLaunchParm lp,
 }

 // CPU implementation of matrix transpose
-void matrixTransposeCPUReference(
-    float * output,
-    float * input,
-    const unsigned int width)
-{
-    for(unsigned int j=0; j < width; j++)
-    {
-        for(unsigned int i=0; i < width; i++)
-        {
-            output[i*width + j] = input[j*width + i];
+void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
+    for (unsigned int j = 0; j < width; j++) {
+        for (unsigned int i = 0; i < width; i++) {
+            output[i * width + j] = input[j * width + i];
        }
    }
 }


 // Use a separate function to demonstrate how to use function name as part of scoped marker:
-void runGPU(float *Matrix, float *TransposeMatrix, 
-            float* gpuMatrix, float* gpuTransposeMatrix)  {
+void runGPU(float* Matrix, float* TransposeMatrix, float* gpuMatrix, float* gpuTransposeMatrix) {
+    // __func__ is a standard C++ predefined identifier that holds the name of the enclosing
+    // function, in this case "runGPU"
+    HIP_SCOPED_MARKER(__func__, "MyGroup");

-  // __func__ is a standard C++ macro which expands to the name of the function, in this case "runGPU"
-  HIP_SCOPED_MARKER(__func__, "MyGroup");
+    for (int i = 0; i < ITERATIONS; i++) {
+        if (i == startTriggerIteration) {
+            hipProfilerStart();
+        }
+        if (i == stopTriggerIteration) {
+            hipProfilerStop();
+        }

-  for (int i=0; i<ITERATIONS; i++) {
+        float eventMs = 0.0f;

-    if (i==startTriggerIteration) {
-      hipProfilerStart();
+        hipEvent_t start, stop;
+        hipEventCreate(&start);
+        hipEventCreate(&stop);
+
+
+        // Record the start event
+        hipEventRecord(start, NULL);
+
+        // Memory transfer from host to device
+        hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
+
+        // Record the stop event
+        hipEventRecord(stop, NULL);
+        hipEventSynchronize(stop);
+
+        hipEventElapsedTime(&eventMs, start, stop);
+
+        printf("hipMemcpyHostToDevice time taken  = %6.3fms\n", eventMs);
+
+        // Record the start event
+        hipEventRecord(start, NULL);
+
+        // Launching kernel from host
+        hipLaunchKernel(matrixTranspose,
+                        dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+                        dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
+                        gpuMatrix, WIDTH);
+
+        // Record the stop event
+        hipEventRecord(stop, NULL);
+        hipEventSynchronize(stop);
+        hipEventElapsedTime(&eventMs, start, stop);
+
+        printf("kernel Execution time             = %6.3fms\n", eventMs);
+
+        // Record the start event
+        hipEventRecord(start, NULL);
+
+        // Memory transfer from device to host
+        hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
+
+        // Record the stop event
+        hipEventRecord(stop, NULL);
+        hipEventSynchronize(stop);
+
+        hipEventElapsedTime(&eventMs, start, stop);
+
+        printf("hipMemcpyDeviceToHost time taken  = %6.3fms\n", eventMs);
    }
-    if (i==stopTriggerIteration) {
-      hipProfilerStop();
-    }
-
-    float eventMs = 0.0f;
-
-    hipEvent_t start, stop;
-    hipEventCreate(&start);
-    hipEventCreate(&stop);
-
-
-    // Record the start event
-    hipEventRecord(start, NULL);
-
-    // Memory transfer from host to device
-    hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
-
-    // Record the stop event
-    hipEventRecord(stop, NULL);
-    hipEventSynchronize(stop);
-
-    hipEventElapsedTime(&eventMs, start, stop);
-
-    printf ("hipMemcpyHostToDevice time taken  = %6.3fms\n", eventMs);
-
-    // Record the start event
-    hipEventRecord(start, NULL);
-
-    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    0, 0,
-                    gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-    // Record the stop event
-    hipEventRecord(stop, NULL);
-    hipEventSynchronize(stop);
-    hipEventElapsedTime(&eventMs, start, stop);
-
-    printf ("kernel Execution time             = %6.3fms\n", eventMs);
-
-    // Record the start event
-    hipEventRecord(start, NULL);
-
-    // Memory transfer from device to host
-    hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
-
-    // Record the stop event
-    hipEventRecord(stop, NULL);
-    hipEventSynchronize(stop);
-
-    hipEventElapsedTime(&eventMs, start, stop);
-
-    printf ("hipMemcpyDeviceToHost time taken  = %6.3fms\n", eventMs);
-  }
 };


-int main(int argc, char *argv[]) {
-
-  if (argc >= 2) {
-    startTriggerIteration = atoi(argv[1]);
-    printf ("info : will start tracing at iteration:%d\n", startTriggerIteration);
-  } 
-  if (argc >= 3) {
-    stopTriggerIteration = atoi(argv[2]);
-    printf ("info : will stop tracing at iteration:%d\n", stopTriggerIteration);
-  }
-
-  float* Matrix;
-  float* TransposeMatrix;
-  float* cpuTransposeMatrix;
-
-  float* gpuMatrix;
-  float* gpuTransposeMatrix;
-
-  hipDeviceProp_t devProp;
-  hipGetDeviceProperties(&devProp, 0);
-
-  std::cout << "Device name " << devProp.name << std::endl;
-
-  {
-      // Show example of how to create a "scoped marker".  
-      // The scoped marker records the time spent inside the { scope } of the marker - the begin timestamp is at the
-      // beginning of the code scope, and the end is recorded when the SCOPE exits.  This can be viewed in CodeXL
-      // timeline relative to other GPU and CPU events.
-      // This marker captures the time spent in setup including host allocation, initialization, and device memory allocation.
-      HIP_SCOPED_MARKER("Setup", "MyGroup");
-
-
-
-      Matrix = (float*)malloc(NUM * sizeof(float));
-      TransposeMatrix = (float*)malloc(NUM * sizeof(float));
-      cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
-
-      // initialize the input data
-      for (int i = 0; i < NUM; i++) {
-        Matrix[i] = (float)i*10.0f;
-      }
-
-
-      // allocate the memory on the device side
-      hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-      hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-      // FYI, the scoped-marker will be destroyed here when the scope exits, and will record its "end" timestamp.
-  }
-
-  runGPU(Matrix, TransposeMatrix, gpuMatrix, gpuTransposeMatrix);
-
-
-  // show how to use explicit begin/end markers:
-  // We begin the timed region with HIP_BEGIN_MARKER, passing in the markerName and group:
-  // The region will stop when HIP_END_MARKER is called
-  // This is another way to mark begin/end - as an alternative to scoped markers.
-  HIP_BEGIN_MARKER("Check&TearDown", "MyGroup");
-
-  int errors = 0;
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  double eps = 1.0E-6;
-  for (int i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-      errors++;
+int main(int argc, char* argv[]) {
+    if (argc >= 2) {
+        startTriggerIteration = atoi(argv[1]);
+        printf("info : will start tracing at iteration:%d\n", startTriggerIteration);
+    }
+    if (argc >= 3) {
+        stopTriggerIteration = atoi(argv[2]);
+        printf("info : will stop tracing at iteration:%d\n", stopTriggerIteration);
    }
-  }
-  if (errors!=0) {
-    printf("FAILED: %d errors\n",errors);
-  } else {
-    printf ("PASSED!\n");
-  }

-  //free the resources on device side
-  hipFree(gpuMatrix);
-  hipFree(gpuTransposeMatrix);
+    float* Matrix;
+    float* TransposeMatrix;
+    float* cpuTransposeMatrix;

-  //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
+    float* gpuMatrix;
+    float* gpuTransposeMatrix;

-  // This ends the last marker started in this thread, in this case "Check&TearDown"
-  HIP_END_MARKER();  
-  
-  return errors;
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);
+
+    std::cout << "Device name " << devProp.name << std::endl;
+
+    {
+        // Show example of how to create a "scoped marker".
+        // The scoped marker records the time spent inside the { scope } of the marker - the begin
+        // timestamp is at the beginning of the code scope, and the end is recorded when the SCOPE
+        // exits.  This can be viewed in CodeXL timeline relative to other GPU and CPU events. This
+        // marker captures the time spent in setup including host allocation, initialization, and
+        // device memory allocation.
+        HIP_SCOPED_MARKER("Setup", "MyGroup");
+
+
+        Matrix = (float*)malloc(NUM * sizeof(float));
+        TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+        cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+        // initialize the input data
+        for (int i = 0; i < NUM; i++) {
+            Matrix[i] = (float)i * 10.0f;
+        }
+
+
+        // allocate the memory on the device side
+        hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+        hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+        // FYI, the scoped-marker will be destroyed here when the scope exits, and will record its
+        // "end" timestamp.
+    }
+
+    runGPU(Matrix, TransposeMatrix, gpuMatrix, gpuTransposeMatrix);
+
+
+    // show how to use explicit begin/end markers:
+    // We begin the timed region with HIP_BEGIN_MARKER, passing in the markerName and group:
+    // The region will stop when HIP_END_MARKER is called
+    // This is another way to mark begin/end - as an alternative to scoped markers.
+    HIP_BEGIN_MARKER("Check&TearDown", "MyGroup");
+
+    int errors = 0;
+
+    // CPU MatrixTranspose computation
+    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+    // verify the results
+    double eps = 1.0E-6;
+    for (int i = 0; i < NUM; i++) {
+        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+            errors++;
+        }
+    }
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
+    } else {
+        printf("PASSED!\n");
+    }
+
+    // free the resources on device side
+    hipFree(gpuMatrix);
+    hipFree(gpuTransposeMatrix);
+
+    // free the resources on host side
+    free(Matrix);
+    free(TransposeMatrix);
+    free(cpuTransposeMatrix);
+
+    // This ends the last marker started in this thread, in this case "Check&TearDown"
+    HIP_END_MARKER();
+
+    return errors;
 }
@@ -20,28 +20,24 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include<iostream>
+#include <iostream>

 // hip header file
 #include "hip/hip_runtime.h"


-#define WIDTH     64
+#define WIDTH 64

-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
-    __shared__ float sharedMem[WIDTH*WIDTH];
+__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+    __shared__ float sharedMem[WIDTH * WIDTH];

    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
@@ -54,89 +50,80 @@ __global__ void matrixTranspose(hipLaunchParm lp,
 }

 // CPU implementation of matrix transpose
-void matrixTransposeCPUReference(
-    float * output,
-    float * input,
-    const unsigned int width)
-{
-    for(unsigned int j=0; j < width; j++)
-    {
-        for(unsigned int i=0; i < width; i++)
-        {
-            output[i*width + j] = input[j*width + i];
+void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
+    for (unsigned int j = 0; j < width; j++) {
+        for (unsigned int i = 0; i < width; i++) {
+            output[i * width + j] = input[j * width + i];
        }
    }
 }

 int main() {
+    float* Matrix;
+    float* TransposeMatrix;
+    float* cpuTransposeMatrix;

-  float* Matrix;
-  float* TransposeMatrix;
-  float* cpuTransposeMatrix;
+    float* gpuMatrix;
+    float* gpuTransposeMatrix;

-  float* gpuMatrix;
-  float* gpuTransposeMatrix;
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);

-  hipDeviceProp_t devProp;
-  hipGetDeviceProperties(&devProp, 0);
+    std::cout << "Device name " << devProp.name << std::endl;

-  std::cout << "Device name " << devProp.name << std::endl;
+    int i;
+    int errors;

-  int i;
-  int errors;
+    Matrix = (float*)malloc(NUM * sizeof(float));
+    TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+    cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
-  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
-
-  // initialize the input data
-  for (i = 0; i < NUM; i++) {
-    Matrix[i] = (float)i*10.0f;
-  }
-
-  // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
-
-  // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  errors = 0;
-  double eps = 1.0E-6;
-  for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
-      errors++;
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+        Matrix[i] = (float)i * 10.0f;
    }
-  }
-  if (errors!=0) {
-    printf("FAILED: %d errors\n",errors);
-  } else {
-    printf ("PASSED!\n");
-  }

-  //free the resources on device side
-  hipFree(gpuMatrix);
-  hipFree(gpuTransposeMatrix);
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+    hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));

-  //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
+    // Memory transfer from host to device
+    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

-  return errors;
+    // Launching kernel from host
+    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
+                    gpuMatrix, WIDTH);
+
+    // Memory transfer from device to host
+    hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU MatrixTranspose computation
+    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-6;
+    for (i = 0; i < NUM; i++) {
+        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+            printf("%d cpu: %f gpu  %f\n", i, cpuTransposeMatrix[i], TransposeMatrix[i]);
+            errors++;
+        }
+    }
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
+    } else {
+        printf("PASSED!\n");
+    }
+
+    // free the resources on device side
+    hipFree(gpuMatrix);
+    hipFree(gpuTransposeMatrix);
+
+    // free the resources on host side
+    free(Matrix);
+    free(TransposeMatrix);
+    free(cpuTransposeMatrix);
+
+    return errors;
 }
@@ -20,122 +20,106 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include<iostream>
+#include <iostream>

 // hip header file
 #include "hip/hip_runtime.h"


-#define WIDTH     4
+#define WIDTH 4

-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
+__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

    float val = in[x];

-    for(int i=0;i<width;i++)
-    {
-        for(int j=0;j<width;j++)
-            out[i*width + j] = __shfl(val,j*width + i);
+    for (int i = 0; i < width; i++) {
+        for (int j = 0; j < width; j++) out[i * width + j] = __shfl(val, j * width + i);
    }
 }

 // CPU implementation of matrix transpose
-void matrixTransposeCPUReference(
-    float * output,
-    float * input,
-    const unsigned int width)
-{
-    for(unsigned int j=0; j < width; j++)
-    {
-        for(unsigned int i=0; i < width; i++)
-        {
-            output[i*width + j] = input[j*width + i];
+void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
+    for (unsigned int j = 0; j < width; j++) {
+        for (unsigned int i = 0; i < width; i++) {
+            output[i * width + j] = input[j * width + i];
        }
    }
 }

 int main() {
+    float* Matrix;
+    float* TransposeMatrix;
+    float* cpuTransposeMatrix;

-  float* Matrix;
-  float* TransposeMatrix;
-  float* cpuTransposeMatrix;
+    float* gpuMatrix;
+    float* gpuTransposeMatrix;

-  float* gpuMatrix;
-  float* gpuTransposeMatrix;
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);

-  hipDeviceProp_t devProp;
-  hipGetDeviceProperties(&devProp, 0);
+    std::cout << "Device name " << devProp.name << std::endl;

-  std::cout << "Device name " << devProp.name << std::endl;
+    int i;
+    int errors;

-  int i;
-  int errors;
+    Matrix = (float*)malloc(NUM * sizeof(float));
+    TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+    cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
-  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
-
-  // initialize the input data
-  for (i = 0; i < NUM; i++) {
-    Matrix[i] = (float)i*10.0f;
-  }
-
-  // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
-
-  // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  errors = 0;
-  double eps = 1.0E-6;
-  for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
-      errors++;
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+        Matrix[i] = (float)i * 10.0f;
    }
-  }
-  if (errors!=0) {
-    printf("FAILED: %d errors\n",errors);
-  } else {
-    printf ("PASSED!\n");
-  }

-  //free the resources on device side
-  hipFree(gpuMatrix);
-  hipFree(gpuTransposeMatrix);
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+    hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));

-  //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
+    // Memory transfer from host to device
+    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

-  return errors;
+    // Launching kernel from host
+    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+                    gpuTransposeMatrix, gpuMatrix, WIDTH);
+
+    // Memory transfer from device to host
+    hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU MatrixTranspose computation
+    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-6;
+    for (i = 0; i < NUM; i++) {
+        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+            printf("%d cpu: %f gpu  %f\n", i, cpuTransposeMatrix[i], TransposeMatrix[i]);
+            errors++;
+        }
+    }
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
+    } else {
+        printf("PASSED!\n");
+    }
+
+    // free the resources on device side
+    hipFree(gpuMatrix);
+    hipFree(gpuTransposeMatrix);
+
+    // free the resources on host side
+    free(Matrix);
+    free(TransposeMatrix);
+    free(cpuTransposeMatrix);
+
+    return errors;
 }
@@ -20,118 +20,104 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include<iostream>
+#include <iostream>

 // hip header file
 #include "hip/hip_runtime.h"


-#define WIDTH     4
+#define WIDTH 4

-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
+__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
-    float val = in[y*width + x];
+    float val = in[y * width + x];

-    out[x*width + y] = __shfl(val,y*width + x);
+    out[x * width + y] = __shfl(val, y * width + x);
 }

 // CPU implementation of matrix transpose
-void matrixTransposeCPUReference(
-    float * output,
-    float * input,
-    const unsigned int width)
-{
-    for(unsigned int j=0; j < width; j++)
-    {
-        for(unsigned int i=0; i < width; i++)
-        {
-            output[i*width + j] = input[j*width + i];
+void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
+    for (unsigned int j = 0; j < width; j++) {
+        for (unsigned int i = 0; i < width; i++) {
+            output[i * width + j] = input[j * width + i];
        }
    }
 }

 int main() {
+    float* Matrix;
+    float* TransposeMatrix;
+    float* cpuTransposeMatrix;

-  float* Matrix;
-  float* TransposeMatrix;
-  float* cpuTransposeMatrix;
+    float* gpuMatrix;
+    float* gpuTransposeMatrix;

-  float* gpuMatrix;
-  float* gpuTransposeMatrix;
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);

-  hipDeviceProp_t devProp;
-  hipGetDeviceProperties(&devProp, 0);
+    std::cout << "Device name " << devProp.name << std::endl;

-  std::cout << "Device name " << devProp.name << std::endl;
+    int i;
+    int errors;

-  int i;
-  int errors;
+    Matrix = (float*)malloc(NUM * sizeof(float));
+    TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+    cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
-  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
-
-  // initialize the input data
-  for (i = 0; i < NUM; i++) {
-    Matrix[i] = (float)i*10.0f;
-  }
-
-  // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
-
-  // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  errors = 0;
-  double eps = 1.0E-6;
-  for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
-      errors++;
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+        Matrix[i] = (float)i * 10.0f;
    }
-  }
-  if (errors!=0) {
-    printf("FAILED: %d errors\n",errors);
-  } else {
-    printf ("PASSED!\n");
-  }

-  //free the resources on device side
-  hipFree(gpuMatrix);
-  hipFree(gpuTransposeMatrix);
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+    hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));

-  //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
+    // Memory transfer from host to device
+    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

-  return errors;
+    // Launching kernel from host
+    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
+                    gpuTransposeMatrix, gpuMatrix, WIDTH);
+
+    // Memory transfer from device to host
+    hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU MatrixTranspose computation
+    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-6;
+    for (i = 0; i < NUM; i++) {
+        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+            printf("%d cpu: %f gpu  %f\n", i, cpuTransposeMatrix[i], TransposeMatrix[i]);
+            errors++;
+        }
+    }
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
+    } else {
+        printf("PASSED!\n");
+    }
+
+    // free the resources on device side
+    hipFree(gpuMatrix);
+    hipFree(gpuTransposeMatrix);
+
+    // free the resources on host side
+    free(Matrix);
+    free(TransposeMatrix);
+    free(cpuTransposeMatrix);
+
+    return errors;
 }
@@ -19,26 +19,22 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
-#include<iostream>
+#include <iostream>

 // hip header file
 #include "hip/hip_runtime.h"

-#define WIDTH     16
+#define WIDTH 16

-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
+__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem);

@@ -53,89 +49,80 @@ __global__ void matrixTranspose(hipLaunchParm lp,
 }

 // CPU implementation of matrix transpose
-void matrixTransposeCPUReference(
-    float * output,
-    float * input,
-    const unsigned int width)
-{
-    for(unsigned int j=0; j < width; j++)
-    {
-        for(unsigned int i=0; i < width; i++)
-        {
-            output[i*width + j] = input[j*width + i];
+void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
+    for (unsigned int j = 0; j < width; j++) {
+        for (unsigned int i = 0; i < width; i++) {
+            output[i * width + j] = input[j * width + i];
        }
    }
 }

 int main() {
+    float* Matrix;
+    float* TransposeMatrix;
+    float* cpuTransposeMatrix;

-  float* Matrix;
-  float* TransposeMatrix;
-  float* cpuTransposeMatrix;
+    float* gpuMatrix;
+    float* gpuTransposeMatrix;

-  float* gpuMatrix;
-  float* gpuTransposeMatrix;
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);

-  hipDeviceProp_t devProp;
-  hipGetDeviceProperties(&devProp, 0);
+    std::cout << "Device name " << devProp.name << std::endl;

-  std::cout << "Device name " << devProp.name << std::endl;
+    int i;
+    int errors;

-  int i;
-  int errors;
+    Matrix = (float*)malloc(NUM * sizeof(float));
+    TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+    cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
-  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
-
-  // initialize the input data
-  for (i = 0; i < NUM; i++) {
-    Matrix[i] = (float)i*10.0f;
-  }
-
-  // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
-
-  // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  sizeof(float)*WIDTH*WIDTH, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  errors = 0;
-  double eps = 1.0E-6;
-  for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
-      errors++;
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+        Matrix[i] = (float)i * 10.0f;
    }
-  }
-  if (errors!=0) {
-    printf("FAILED: %d errors\n",errors);
-  } else {
-    printf ("dynamic_shared PASSED!\n");
-  }

-  //free the resources on device side
-  hipFree(gpuMatrix);
-  hipFree(gpuTransposeMatrix);
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+    hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));

-  //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
+    // Memory transfer from host to device
+    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

-  return errors;
+    // Launching kernel from host
+    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
+                    0, gpuTransposeMatrix, gpuMatrix, WIDTH);
+
+    // Memory transfer from device to host
+    hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU MatrixTranspose computation
+    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-6;
+    for (i = 0; i < NUM; i++) {
+        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+            printf("%d cpu: %f gpu  %f\n", i, cpuTransposeMatrix[i], TransposeMatrix[i]);
+            errors++;
+        }
+    }
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
+    } else {
+        printf("dynamic_shared PASSED!\n");
+    }
+
+    // free the resources on device side
+    hipFree(gpuMatrix);
+    hipFree(gpuTransposeMatrix);
+
+    // free the resources on host side
+    free(Matrix);
+    free(TransposeMatrix);
+    free(cpuTransposeMatrix);
+
+    return errors;
 }
@@ -20,22 +20,19 @@ THE SOFTWARE.
 #include <iostream>
 #include <hip/hip_runtime.h>

-#define WIDTH     32
+#define WIDTH 32

-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 using namespace std;

-__global__ void matrixTranspose_static_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
-    __shared__ float sharedMem[WIDTH*WIDTH];
+__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+                                              const int width) {
+    __shared__ float sharedMem[WIDTH * WIDTH];

    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
@@ -47,11 +44,8 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp,
    out[y * width + x] = sharedMem[y * width + x];
 }

-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
+__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+                                               const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem)

@@ -65,39 +59,34 @@ __global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp,
    out[y * width + x] = sharedMem[y * width + x];
 }

-void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, float **TransposeMatrix, int width)
-{
+void MultipleStream(float** data, float* randArray, float** gpuTransposeMatrix,
+                    float** TransposeMatrix, int width) {
    const int num_streams = 2;
    hipStream_t streams[num_streams];

-    for(int i=0;i<num_streams;i++)
-        hipStreamCreate(&streams[i]);
+    for (int i = 0; i < num_streams; i++) hipStreamCreate(&streams[i]);

-    for(int i=0;i<num_streams;i++)
-    {
+    for (int i = 0; i < num_streams; i++) {
        hipMalloc((void**)&data[i], NUM * sizeof(float));
-        hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice,streams[i]);
+        hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice, streams[i]);
    }

    hipLaunchKernel(matrixTranspose_static_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    0, streams[0],
+                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, streams[0],
                    gpuTransposeMatrix[0], data[0], width);

    hipLaunchKernel(matrixTranspose_dynamic_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    sizeof(float)*WIDTH*WIDTH, streams[1],
-                    gpuTransposeMatrix[1], data[1], width);
-
-    for(int i=0;i<num_streams;i++)
-    hipMemcpyAsync(TransposeMatrix[i], gpuTransposeMatrix[i], NUM*sizeof(float), hipMemcpyDeviceToHost, streams[i]);
+                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
+                    streams[1], gpuTransposeMatrix[1], data[1], width);

+    for (int i = 0; i < num_streams; i++)
+        hipMemcpyAsync(TransposeMatrix[i], gpuTransposeMatrix[i], NUM * sizeof(float),
+                       hipMemcpyDeviceToHost, streams[i]);
 }

-int main(){
-
+int main() {
    hipSetDevice(0);

    float *data[2], *TransposeMatrix[2], *gpuTransposeMatrix[2], *randArray;
@@ -112,9 +101,8 @@ int main(){
    hipMalloc((void**)&gpuTransposeMatrix[0], NUM * sizeof(float));
    hipMalloc((void**)&gpuTransposeMatrix[1], NUM * sizeof(float));

-    for(int i = 0; i < NUM; i++)
-    {
-        randArray[i] = (float)i*1.0f;
+    for (int i = 0; i < NUM; i++) {
+        randArray[i] = (float)i * 1.0f;
    }

    MultipleStream(data, randArray, gpuTransposeMatrix, TransposeMatrix, width);
@@ -125,22 +113,22 @@ int main(){
    int errors = 0;
    double eps = 1.0E-6;
    for (int i = 0; i < NUM; i++) {
-        if (std::abs(TransposeMatrix[0][i] - TransposeMatrix[1][i]) > eps ) {
-        printf("%d stream0: %f stream1  %f\n",i,TransposeMatrix[0][i],TransposeMatrix[1][i]);
-        errors++;
+        if (std::abs(TransposeMatrix[0][i] - TransposeMatrix[1][i]) > eps) {
+            printf("%d stream0: %f stream1  %f\n", i, TransposeMatrix[0][i], TransposeMatrix[1][i]);
+            errors++;
        }
    }
-    if (errors!=0) {
-        printf("FAILED: %d errors\n",errors);
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
    } else {
-        printf ("stream PASSED!\n");
+        printf("stream PASSED!\n");
    }

    free(randArray);
-    for(int i=0;i<2;i++){
-       hipFree(data[i]);
-       hipFree(gpuTransposeMatrix[i]);
-       free(TransposeMatrix[i]);
+    for (int i = 0; i < 2; i++) {
+        hipFree(data[i]);
+        hipFree(gpuTransposeMatrix[i]);
+        free(TransposeMatrix[i]);
    }

    hipDeviceReset();
@@ -20,105 +20,94 @@ THE SOFTWARE.
 #include <iostream>
 #include <hip/hip_runtime.h>
 #include <assert.h>
-#define WIDTH     32
+#define WIDTH 32

-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 using namespace std;

-#define KNRM  "\x1B[0m"
-#define KRED  "\x1B[31m"
+#define KNRM "\x1B[0m"
+#define KRED "\x1B[31m"

-#define failed(...) \
-    printf ("%serror: ", KRED);\
-    printf (__VA_ARGS__);\
-    printf ("\n");\
-    printf ("error: TEST FAILED\n%s", KNRM );\
+#define failed(...)                                                                                \
+    printf("%serror: ", KRED);                                                                     \
+    printf(__VA_ARGS__);                                                                           \
+    printf("\n");                                                                                  \
+    printf("error: TEST FAILED\n%s", KNRM);                                                        \
    abort();

-#define HIPCHECK(error) \
-{\
-    hipError_t localError = error; \
-    if (localError != hipSuccess) { \
-        printf("%serror: '%s'(%d) from %s at %s:%d%s\n", \
-        KRED, hipGetErrorString(localError), localError,\
-        #error,__FILE__, __LINE__, KNRM); \
-        failed("API returned error code.");\
-    }\
-}
+#define HIPCHECK(error)                                                                            \
+    {                                                                                              \
+        hipError_t localError = error;                                                             \
+        if (localError != hipSuccess) {                                                            \
+            printf("%serror: '%s'(%d) from %s at %s:%d%s\n", KRED, hipGetErrorString(localError),  \
+                   localError, #error, __FILE__, __LINE__, KNRM);                                  \
+            failed("API returned error code.");                                                    \
+        }                                                                                          \
+    }

-void checkPeer2PeerSupport()
-{
+void checkPeer2PeerSupport() {
    int gpuCount;
    int canAccessPeer;

    HIPCHECK(hipGetDeviceCount(&gpuCount));

-    for (int currentGpu=0; currentGpu<gpuCount; currentGpu++)
-    {
+    for (int currentGpu = 0; currentGpu < gpuCount; currentGpu++) {
        HIPCHECK(hipSetDevice(currentGpu));

-        for (int peerGpu=0; peerGpu<currentGpu; peerGpu++)
-        {
-            if (currentGpu!=peerGpu)
-            {
+        for (int peerGpu = 0; peerGpu < currentGpu; peerGpu++) {
+            if (currentGpu != peerGpu) {
                HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu));
-                printf ("currentGpu#%d canAccessPeer: peerGpu#%d=%d\n", currentGpu, peerGpu, canAccessPeer);
+                printf("currentGpu#%d canAccessPeer: peerGpu#%d=%d\n", currentGpu, peerGpu,
+                       canAccessPeer);
            }

            HIPCHECK(hipSetDevice(peerGpu));
            HIPCHECK(hipDeviceReset());
        }
-    HIPCHECK(hipSetDevice(currentGpu));
-    HIPCHECK(hipDeviceReset());
+        HIPCHECK(hipSetDevice(currentGpu));
+        HIPCHECK(hipDeviceReset());
    }
 }

-void enablePeer2Peer(int currentGpu, int peerGpu)
-{
+void enablePeer2Peer(int currentGpu, int peerGpu) {
    int canAccessPeer;

    // Must be on a multi-gpu system:
-    assert (currentGpu != peerGpu);
+    assert(currentGpu != peerGpu);

    HIPCHECK(hipSetDevice(currentGpu));
    hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu);

-    if(canAccessPeer==1){
+    if (canAccessPeer == 1) {
        HIPCHECK(hipDeviceEnablePeerAccess(peerGpu, 0));
-    }
-    else
-    printf("peer2peer transfer not possible between the selected gpu devices");
+    } else
+        printf("peer2peer transfer not possible between the selected gpu devices");
 }

-void disablePeer2Peer(int currentGpu, int peerGpu)
-{
+void disablePeer2Peer(int currentGpu, int peerGpu) {
    int canAccessPeer;

    // Must be on a multi-gpu system:
-    assert (currentGpu != peerGpu);
+    assert(currentGpu != peerGpu);

    HIPCHECK(hipSetDevice(currentGpu));
    hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu);

-    if(canAccessPeer==1){
+    if (canAccessPeer == 1) {
        HIPCHECK(hipDeviceDisablePeerAccess(peerGpu));
-    }
-    else
-    printf("peer2peer disable not required");
+    } else
+        printf("peer2peer disable not required");
 }


-__global__ void matrixTranspose_static_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
-    __shared__ float sharedMem[WIDTH*WIDTH];
+__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+                                              const int width) {
+    __shared__ float sharedMem[WIDTH * WIDTH];

    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
@@ -130,11 +119,8 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp,
    out[y * width + x] = sharedMem[y * width + x];
 }

-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
+__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+                                               const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem)

@@ -148,8 +134,7 @@ __global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp,
    out[y * width + x] = sharedMem[y * width + x];
 }

-int main(){
-
+int main() {
    checkPeer2PeerSupport();

    int gpuCount;
@@ -157,8 +142,7 @@ int main(){

    HIPCHECK(hipGetDeviceCount(&gpuCount));

-    if (gpuCount < 2)
-    {
+    if (gpuCount < 2) {
        printf("Peer2Peer application requires atleast 2 gpu devices");
        return 0;
    }
@@ -166,7 +150,7 @@ int main(){
    currentGpu = 0;
    peerGpu = (currentGpu + 1);

-    printf ("currentGpu=%d peerGpu=%d (Total no. of gpu = %d)\n", currentGpu, peerGpu, gpuCount);
+    printf("currentGpu=%d peerGpu=%d (Total no. of gpu = %d)\n", currentGpu, peerGpu, gpuCount);

    float *data[2], *TransposeMatrix[2], *gpuTransposeMatrix[2], *randArray;

@@ -174,9 +158,8 @@ int main(){

    randArray = (float*)malloc(NUM * sizeof(float));

-    for(int i = 0; i < NUM; i++)
-    {
-        randArray[i] = (float)i*1.0f;
+    for (int i = 0; i < NUM; i++) {
+        randArray[i] = (float)i * 1.0f;
    }

    enablePeer2Peer(currentGpu, peerGpu);
@@ -188,10 +171,9 @@ int main(){
    hipMemcpy(data[0], randArray, NUM * sizeof(float), hipMemcpyHostToDevice);

    hipLaunchKernel(matrixTranspose_static_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    0, 0,
-                    gpuTransposeMatrix[0], data[0], width);
+                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix[0],
+                    data[0], width);

    HIPCHECK(hipSetDevice(peerGpu));
    TransposeMatrix[1] = (float*)malloc(NUM * sizeof(float));
@@ -200,12 +182,12 @@ int main(){
    hipMemcpy(data[1], gpuTransposeMatrix[0], NUM * sizeof(float), hipMemcpyDeviceToDevice);

    hipLaunchKernel(matrixTranspose_dynamic_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    sizeof(float)*WIDTH*WIDTH, 0,
-                    gpuTransposeMatrix[1], data[1], width);
+                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
+                    0, gpuTransposeMatrix[1], data[1], width);

-    hipMemcpy(TransposeMatrix[1], gpuTransposeMatrix[1], NUM*sizeof(float), hipMemcpyDeviceToHost);
+    hipMemcpy(TransposeMatrix[1], gpuTransposeMatrix[1], NUM * sizeof(float),
+              hipMemcpyDeviceToHost);

    hipDeviceSynchronize();

@@ -215,22 +197,22 @@ int main(){
    int errors = 0;
    double eps = 1.0E-6;
    for (int i = 0; i < NUM; i++) {
-        if (std::abs(randArray[i] - TransposeMatrix[1][i]) > eps ) {
-        printf("%d cpu: %f gpu peered data  %f\n",i,randArray[i],TransposeMatrix[1][i]);
-        errors++;
+        if (std::abs(randArray[i] - TransposeMatrix[1][i]) > eps) {
+            printf("%d cpu: %f gpu peered data  %f\n", i, randArray[i], TransposeMatrix[1][i]);
+            errors++;
        }
    }
-    if (errors!=0) {
-        printf("FAILED: %d errors\n",errors);
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
    } else {
-        printf ("Peer2Peer PASSED!\n");
+        printf("Peer2Peer PASSED!\n");
    }

    free(randArray);
-    for(int i=0;i<2;i++){
-       hipFree(data[i]);
-       hipFree(gpuTransposeMatrix[i]);
-       free(TransposeMatrix[i]);
+    for (int i = 0; i < 2; i++) {
+        hipFree(data[i]);
+        hipFree(gpuTransposeMatrix[i]);
+        free(TransposeMatrix[i]);
    }

    HIPCHECK(hipSetDevice(peerGpu));
@@ -20,122 +20,106 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include<iostream>
+#include <iostream>

 // hip header file
 #include "hip/hip_runtime.h"


-#define WIDTH     4
+#define WIDTH 4

-#define NUM       (WIDTH*WIDTH)
+#define NUM (WIDTH * WIDTH)

-#define THREADS_PER_BLOCK_X  4
-#define THREADS_PER_BLOCK_Y  4
-#define THREADS_PER_BLOCK_Z  1
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
-{
+__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    float val = in[x];

 #pragma unroll
-    for(int i=0;i<width;i++)
-    {
-        for(int j=0;j<width;j++)
-            out[i*width + j] = __shfl(val,j*width + i);
+    for (int i = 0; i < width; i++) {
+        for (int j = 0; j < width; j++) out[i * width + j] = __shfl(val, j * width + i);
    }
 }

 // CPU implementation of matrix transpose
-void matrixTransposeCPUReference(
-    float * output,
-    float * input,
-    const unsigned int width)
-{
-    for(unsigned int j=0; j < width; j++)
-    {
-        for(unsigned int i=0; i < width; i++)
-        {
-            output[i*width + j] = input[j*width + i];
+void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
+    for (unsigned int j = 0; j < width; j++) {
+        for (unsigned int i = 0; i < width; i++) {
+            output[i * width + j] = input[j * width + i];
        }
    }
 }

 int main() {
+    float* Matrix;
+    float* TransposeMatrix;
+    float* cpuTransposeMatrix;

-  float* Matrix;
-  float* TransposeMatrix;
-  float* cpuTransposeMatrix;
+    float* gpuMatrix;
+    float* gpuTransposeMatrix;

-  float* gpuMatrix;
-  float* gpuTransposeMatrix;
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);

-  hipDeviceProp_t devProp;
-  hipGetDeviceProperties(&devProp, 0);
+    std::cout << "Device name " << devProp.name << std::endl;

-  std::cout << "Device name " << devProp.name << std::endl;
+    int i;
+    int errors;

-  int i;
-  int errors;
+    Matrix = (float*)malloc(NUM * sizeof(float));
+    TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+    cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
-  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
-
-  // initialize the input data
-  for (i = 0; i < NUM; i++) {
-    Matrix[i] = (float)i*10.0f;
-  }
-
-  // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
-
-  // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  errors = 0;
-  double eps = 1.0E-6;
-  for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
-      errors++;
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+        Matrix[i] = (float)i * 10.0f;
    }
-  }
-  if (errors!=0) {
-    printf("FAILED: %d errors\n",errors);
-  } else {
-    printf ("PASSED!\n");
-  }

-  //free the resources on device side
-  hipFree(gpuMatrix);
-  hipFree(gpuTransposeMatrix);
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+    hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));

-  //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
+    // Memory transfer from host to device
+    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

-  return errors;
+    // Lauching kernel from host
+    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+                    gpuTransposeMatrix, gpuMatrix, WIDTH);
+
+    // Memory transfer from device to host
+    hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU MatrixTranspose computation
+    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-6;
+    for (i = 0; i < NUM; i++) {
+        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+            printf("%d cpu: %f gpu  %f\n", i, cpuTransposeMatrix[i], TransposeMatrix[i]);
+            errors++;
+        }
+    }
+    if (errors != 0) {
+        printf("FAILED: %d errors\n", errors);
+    } else {
+        printf("PASSED!\n");
+    }
+
+    // free the resources on device side
+    hipFree(gpuMatrix);
+    hipFree(gpuTransposeMatrix);
+
+    // free the resources on host side
+    free(Matrix);
+    free(TransposeMatrix);
+    free(cpuTransposeMatrix);
+
+    return errors;
 }
--- a/Show More
+++ b/Show More