From 6ecd303d94fe0d653ee57af06cb66baf19311f1a Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Thu, 28 Nov 2019 14:00:48 -0500
Subject: [PATCH 01/23] add error checking for code object loading

---
 src/program_state.inl | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/src/program_state.inl b/src/program_state.inl
index 993418de96..3a1052ccad 100644
--- a/src/program_state.inl
+++ b/src/program_state.inl
@@ -398,13 +398,19 @@ public:
                                            move(file), move(tmp));
         }
 
-        hsa_code_object_reader_create_from_memory(
-            it->first.data(), it->first.size(), it->second.get());
+        auto check_hsa_error = [](hsa_status_t s) {
+            if (s != HSA_STATUS_SUCCESS) {
+                hip_throw(std::runtime_error{"error when loading code object"});
+            }
+        };
 
-        hsa_executable_load_agent_code_object(
-            executable, agent, *it->second, nullptr, nullptr);
+        check_hsa_error(hsa_code_object_reader_create_from_memory(
+            it->first.data(), it->first.size(), it->second.get()));
 
-        hsa_executable_freeze(executable, nullptr);
+        check_hsa_error(hsa_executable_load_agent_code_object(
+            executable, agent, *it->second, nullptr, nullptr));
+
+        check_hsa_error(hsa_executable_freeze(executable, nullptr));
     }
 
 

From 2eb7ae4f0cd3633e71cb1fb99da043888e8e9c8c Mon Sep 17 00:00:00 2001
From: Rahul Garg <garg.rahul@gmail.com>
Date: Wed, 22 Jan 2020 14:37:47 -0800
Subject: [PATCH 02/23] Simplify hipDispatchLatency sample (#1793)

* Use hipExtLaunchKernelGGL in dispatchlatency sample

* Let it run on NVCC path too

* Refactoring

* Add test_kernel source

* Remove ResultDB

* Remove error checks
---
 samples/1_Utils/hipDispatchLatency/Makefile   |  13 +-
 .../hipDispatchLatency/ResultDatabase.cpp     | 473 ------------------
 .../hipDispatchLatency/ResultDatabase.h       |  89 ----
 .../hipDispatchLatency/hipDispatchLatency.cpp | 231 ++++-----
 .../hipDispatchLatency/test_kernel.cpp        |  24 +
 5 files changed, 140 insertions(+), 690 deletions(-)
 delete mode 100644 samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp
 delete mode 100644 samples/1_Utils/hipDispatchLatency/ResultDatabase.h
 create mode 100644 samples/1_Utils/hipDispatchLatency/test_kernel.cpp

diff --git a/samples/1_Utils/hipDispatchLatency/Makefile b/samples/1_Utils/hipDispatchLatency/Makefile
index 3b69c4a335..0616f01f0d 100644
--- a/samples/1_Utils/hipDispatchLatency/Makefile
+++ b/samples/1_Utils/hipDispatchLatency/Makefile
@@ -2,19 +2,18 @@ HIP_PATH?= $(wildcard /opt/rocm/hip)
 ifeq (,$(HIP_PATH))
 	HIP_PATH=../../..
 endif
-HIPCC=$(HIP_PATH)/bin/hipcc
+HIPCC=$(HIP_PATH)/bin/hipcc -std=c++11
 
 EXE=hipDispatchLatency
 
 CXXFLAGS = -O3
 
-all: ${EXE}
+all: test_kernel.code ${EXE}
 
-$(EXE): hipDispatchLatency.cpp ResultDatabase.cpp
-	$(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp ResultDatabase.cpp -o $@
-
-install: $(EXE)
-	cp $(EXE) $(HIP_PATH)/bin
+$(EXE): hipDispatchLatency.cpp
+	$(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp -o $@
 
+test_kernel.code: test_kernel.cpp
+	$(HIP_PATH)/bin/hipcc --genco  $(GENCO_FLAGS) $^ -o $@
 clean:
 	rm -f *.o $(EXE)
diff --git a/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp b/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp
deleted file mode 100644
index b769ca4b32..0000000000
--- a/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp
+++ /dev/null
@@ -1,473 +0,0 @@
-#include "ResultDatabase.h"
-
-#include <cfloat>
-#include <algorithm>
-#include <cmath>
-#include <iomanip>
-
-using namespace std;
-
-#define SORT_BY_NAME 0
-#define SORT_RETAIN_ATTS_ORDER 1
-
-
-bool ResultDatabase::Result::operator<(const Result& rhs) const {
-    if (test < rhs.test) return true;
-    if (test > rhs.test) return false;
-#if (SORT_RETAIN_ATTS_ORDER == 0)
-    // For ties, sort by the value of the attribute:
-    if (atts < rhs.atts) return true;
-    if (atts > rhs.atts) return false;
-#endif
-    return false;  // less-operator returns false on equal
-}
-
-double ResultDatabase::Result::GetMin() const {
-    double r = FLT_MAX;
-    for (int i = 0; i < value.size(); i++) {
-        r = min(r, value[i]);
-    }
-    return r;
-}
-
-double ResultDatabase::Result::GetMax() const {
-    double r = -FLT_MAX;
-    for (int i = 0; i < value.size(); i++) {
-        r = max(r, value[i]);
-    }
-    return r;
-}
-
-double ResultDatabase::Result::GetMedian() const { return GetPercentile(50); }
-
-double ResultDatabase::Result::GetPercentile(double q) const {
-    int n = value.size();
-    if (n == 0) return FLT_MAX;
-    if (n == 1) return value[0];
-
-    if (q <= 0) return value[0];
-    if (q >= 100) return value[n - 1];
-
-    double index = ((n + 1.) * q / 100.) - 1;
-
-    vector<double> sorted = value;
-    sort(sorted.begin(), sorted.end());
-
-    if (n == 2) return (sorted[0] * (1 - q / 100.) + sorted[1] * (q / 100.));
-
-    int index_lo = int(index);
-    double frac = index - index_lo;
-    if (frac == 0) return sorted[index_lo];
-
-    double lo = sorted[index_lo];
-    double hi = sorted[index_lo + 1];
-    return lo + (hi - lo) * frac;
-}
-
-double ResultDatabase::Result::GetMean() const {
-    double r = 0;
-    for (int i = 0; i < value.size(); i++) {
-        r += value[i];
-    }
-    return r / double(value.size());
-}
-
-double ResultDatabase::Result::GetStdDev() const {
-    double r = 0;
-    double u = GetMean();
-    if (u == FLT_MAX) return FLT_MAX;
-    for (int i = 0; i < value.size(); i++) {
-        r += (value[i] - u) * (value[i] - u);
-    }
-    r = sqrt(r / value.size());
-    return r;
-}
-
-
-void ResultDatabase::AddResults(const string& test, const string& atts, const string& unit,
-                                const vector<double>& values) {
-    for (int i = 0; i < values.size(); i++) {
-        AddResult(test, atts, unit, values[i]);
-    }
-}
-
-static string RemoveAllButLeadingSpaces(const string& a) {
-    string b;
-    int n = a.length();
-    int i = 0;
-    while (i < n && a[i] == ' ') {
-        b += a[i];
-        ++i;
-    }
-    for (; i < n; i++) {
-        if (a[i] != ' ' && a[i] != '\t') b += a[i];
-    }
-    return b;
-}
-
-void ResultDatabase::AddResult(const string& test_orig, const string& atts_orig,
-                               const string& unit_orig, double value) {
-    string test = RemoveAllButLeadingSpaces(test_orig);
-    string atts = RemoveAllButLeadingSpaces(atts_orig);
-    string unit = RemoveAllButLeadingSpaces(unit_orig);
-    int index;
-    for (index = 0; index < results.size(); index++) {
-        if (results[index].test == test && results[index].atts == atts) {
-            if (results[index].unit != unit) throw "Internal error: mixed units";
-
-            break;
-        }
-    }
-
-    if (index >= results.size()) {
-        Result r;
-        r.test = test;
-        r.atts = atts;
-        r.unit = unit;
-        results.push_back(r);
-    }
-
-    results[index].value.push_back(value);
-}
-
-// ****************************************************************************
-//  Method:  ResultDatabase::DumpDetailed
-//
-//  Purpose:
-//    Writes the full results, including all trials.
-//
-//  Arguments:
-//    out        where to print
-//
-//  Programmer:  Jeremy Meredith
-//  Creation:    August 14, 2009
-//
-//  Modifications:
-//    Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010
-//    Renamed to DumpDetailed to make room for a DumpSummary.
-//
-//    Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010
-//    Added note about (*) missing value tag.
-//
-//    Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010
-//    Changed note about missing values to be worded a little better.
-//
-// ****************************************************************************
-void ResultDatabase::DumpDetailed(ostream& out) {
-    vector<Result> sorted(results);
-
-#if SORT_BY_NAME
-    stable_sort(sorted.begin(), sorted.end());
-#endif
-
-    const int testNameW = 24;
-    const int attW = 12;
-    const int fieldW = 11;
-    out << std::fixed << right << std::setprecision(4);
-
-    int maxtrials = 1;
-    for (int i = 0; i < sorted.size(); i++) {
-        if (sorted[i].value.size() > maxtrials) maxtrials = sorted[i].value.size();
-    }
-
-    // TODO: in big parallel runs, the "trials" are the procs
-    // and we really don't want to print them all out....
-    out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "median\t"
-        << "mean\t"
-        << "stddev\t"
-        << "min\t"
-        << "max\t";
-    for (int i = 0; i < maxtrials; i++) out << "trial" << i << "\t";
-    out << endl;
-
-    for (int i = 0; i < sorted.size(); i++) {
-        Result& r = sorted[i];
-        out << setw(testNameW) << r.test + "\t";
-        out << setw(attW) << r.atts + "\t";
-        out << setw(fieldW) << r.unit + "\t";
-        if (r.GetMedian() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMedian() << "\t";
-        if (r.GetMean() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMean() << "\t";
-        if (r.GetStdDev() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetStdDev() << "\t";
-        if (r.GetMin() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMin() << "\t";
-        if (r.GetMax() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMax() << "\t";
-        for (int j = 0; j < r.value.size(); j++) {
-            if (r.value[j] == FLT_MAX)
-                out << "N/A\t";
-            else
-                out << r.value[j] << "\t";
-        }
-
-        out << endl;
-    }
-    if (0) {
-        out << endl
-            << "Note: Any results marked with (*) had missing values." << endl
-            << "      This can occur on systems with a mixture of" << endl
-            << "      device types or architectural capabilities." << endl;
-    }
-}
-
-
-// ****************************************************************************
-//  Method:  ResultDatabase::DumpDetailed
-//
-//  Purpose:
-//    Writes the summary results (min/max/stddev/med/mean), but not
-//    every individual trial.
-//
-//  Arguments:
-//    out        where to print
-//
-//  Programmer:  Jeremy Meredith
-//  Creation:    November 10, 2010
-//
-//  Modifications:
-//    Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010
-//    Added note about (*) missing value tag.
-//
-// ****************************************************************************
-void ResultDatabase::DumpSummary(ostream& out) {
-    vector<Result> sorted(results);
-
-#if SORT_BY_NAME
-    stable_sort(sorted.begin(), sorted.end());
-#endif
-
-    const int testNameW = 32;
-    const int attW = 12;
-    const int fieldW = 9;
-    out << std::fixed << right << std::setprecision(2);
-
-    // TODO: in big parallel runs, the "trials" are the procs
-    // and we really don't want to print them all out....
-    out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "units\t"
-        << "median\t"
-        << "mean\t"
-        << "stddev\t"
-        << "min\t"
-        << "max\t";
-    out << endl;
-
-    for (int i = 0; i < sorted.size(); i++) {
-        Result& r = sorted[i];
-        out << setw(testNameW) << r.test + "\t";
-        out << setw(attW) << r.atts + "\t";
-        out << setw(fieldW) << r.unit + "\t";
-        if (r.GetMedian() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMedian() << "\t";
-        if (r.GetMean() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMean() << "\t";
-        if (r.GetStdDev() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetStdDev() << "\t";
-        if (r.GetMin() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMin() << "\t";
-        if (r.GetMax() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMax() << "\t";
-
-        out << endl;
-    }
-    if (0) {
-        out << endl
-            << "Note: results marked with (*) had missing values such as" << endl
-            << "might occur with a mixture of architectural capabilities." << endl;
-    }
-}
-
-// ****************************************************************************
-//  Method:  ResultDatabase::ClearAllResults
-//
-//  Purpose:
-//    Clears all existing results from the ResultDatabase; used for multiple passes
-//    of the same test or multiple tests.
-//
-//  Arguments:
-//
-//  Programmer:  Jeffrey Young
-//  Creation:    September 10th, 2014
-//
-//  Modifications:
-//
-//
-// ****************************************************************************
-void ResultDatabase::ClearAllResults() { results.clear(); }
-
-// ****************************************************************************
-//  Method:  ResultDatabase::DumpCsv
-//
-//  Purpose:
-//    Writes either detailed or summary results (min/max/stddev/med/mean), but not
-//    every individual trial.
-//
-//  Arguments:
-//    out        file to print CSV results
-//
-//  Programmer:  Jeffrey Young
-//  Creation:    August 28th, 2014
-//
-//  Modifications:
-//
-// ****************************************************************************
-void ResultDatabase::DumpCsv(string fileName) {
-    bool emptyFile;
-    vector<Result> sorted(results);
-
-#if SORT_BY_NAME
-    stable_sort(sorted.begin(), sorted.end());
-#endif
-
-    // Check to see if the file is empty - if so, add the headers
-    emptyFile = this->IsFileEmpty(fileName);
-
-    // Open file and append by default
-    ofstream out;
-    out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app);
-
-    // Add headers only for empty files
-    if (emptyFile) {
-        // TODO: in big parallel runs, the "trials" are the procs
-        // and we really don't want to print them all out....
-        out << "test, "
-            << "atts, "
-            << "units, "
-            << "median, "
-            << "mean, "
-            << "stddev, "
-            << "min, "
-            << "max, ";
-        out << endl;
-    }
-
-    for (int i = 0; i < sorted.size(); i++) {
-        Result& r = sorted[i];
-        out << r.test << ", ";
-        out << r.atts << ", ";
-        out << r.unit << ", ";
-        if (r.GetMedian() == FLT_MAX)
-            out << "N/A, ";
-        else
-            out << r.GetMedian() << ", ";
-        if (r.GetMean() == FLT_MAX)
-            out << "N/A, ";
-        else
-            out << r.GetMean() << ", ";
-        if (r.GetStdDev() == FLT_MAX)
-            out << "N/A, ";
-        else
-            out << r.GetStdDev() << ", ";
-        if (r.GetMin() == FLT_MAX)
-            out << "N/A, ";
-        else
-            out << r.GetMin() << ", ";
-        if (r.GetMax() == FLT_MAX)
-            out << "N/A, ";
-        else
-            out << r.GetMax() << ", ";
-
-        out << endl;
-    }
-    out << endl;
-
-    out.close();
-}
-
-// ****************************************************************************
-//  Method:  ResultDatabase::IsFileEmpty
-//
-//  Purpose:
-//    Returns whether a file is empty - used as a helper for CSV printing
-//
-//  Arguments:
-//    file  The input file to check for emptiness
-//
-//  Programmer:  Jeffrey Young
-//  Creation:    August 28th, 2014
-//
-//  Modifications:
-//
-// ****************************************************************************
-
-bool ResultDatabase::IsFileEmpty(string fileName) {
-    bool fileEmpty;
-
-    ifstream file(fileName.c_str());
-
-    // If the file doesn't exist it is by definition empty
-    if (!file.good()) {
-        return true;
-    } else {
-        fileEmpty = (bool)(file.peek() == ifstream::traits_type::eof());
-        file.close();
-
-        return fileEmpty;
-    }
-
-    // Otherwise, return false
-    return false;
-}
-
-
-// ****************************************************************************
-//  Method:  ResultDatabase::GetResultsForTest
-//
-//  Purpose:
-//    Returns a vector of results for just one test name.
-//
-//  Arguments:
-//    test       the name of the test results to search for
-//
-//  Programmer:  Jeremy Meredith
-//  Creation:    December  3, 2010
-//
-//  Modifications:
-//
-// ****************************************************************************
-vector<ResultDatabase::Result> ResultDatabase::GetResultsForTest(const string& test) {
-    // get only the given test results
-    vector<Result> retval;
-    for (int i = 0; i < results.size(); i++) {
-        Result& r = results[i];
-        if (r.test == test) retval.push_back(r);
-    }
-    return retval;
-}
-
-// ****************************************************************************
-//  Method:  ResultDatabase::GetResults
-//
-//  Purpose:
-//    Returns all the results.
-//
-//  Arguments:
-//
-//  Programmer:  Jeremy Meredith
-//  Creation:    December  3, 2010
-//
-//  Modifications:
-//
-// ****************************************************************************
-const vector<ResultDatabase::Result>& ResultDatabase::GetResults() const { return results; }
diff --git a/samples/1_Utils/hipDispatchLatency/ResultDatabase.h b/samples/1_Utils/hipDispatchLatency/ResultDatabase.h
deleted file mode 100644
index ca6a00fc91..0000000000
--- a/samples/1_Utils/hipDispatchLatency/ResultDatabase.h
+++ /dev/null
@@ -1,89 +0,0 @@
-#ifndef RESULT_DATABASE_H
-#define RESULT_DATABASE_H
-
-#include <string>
-#include <vector>
-#include <iostream>
-#include <fstream>
-#include <cfloat>
-using std::ifstream;
-using std::ofstream;
-using std::ostream;
-using std::string;
-using std::vector;
-
-
-// ****************************************************************************
-// Class:  ResultDatabase
-//
-// Purpose:
-//   Track numerical results as they are generated.
-//   Print statistics of raw results.
-//
-// Programmer:  Jeremy Meredith
-// Creation:    June 12, 2009
-//
-// Modifications:
-//    Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
-//    Split timing reports into detailed and summary.  E.g. for serial code,
-//    we might report all trial values, but skip them in parallel.
-//
-//    Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010
-//    Added check for missing value tag.
-//
-//    Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010
-//    Added percentile statistic.
-//
-//    Jeremy Meredith, Fri Dec  3 16:30:31 EST 2010
-//    Added a method to extract a subset of results based on test name.  Also,
-//    the Result class is now public, so that clients can use them directly.
-//    Added a GetResults method as well, and made several functions const.
-//
-// ****************************************************************************
-class ResultDatabase {
-   public:
-    //
-    // A performance result for a single SHOC benchmark run.
-    //
-    struct Result {
-        string test;           // e.g. "readback"
-        string atts;           // e.g. "pagelocked 4k^2"
-        string unit;           // e.g. "MB/sec"
-        vector<double> value;  // e.g. "837.14"
-        double GetMin() const;
-        double GetMax() const;
-        double GetMedian() const;
-        double GetPercentile(double q) const;
-        double GetMean() const;
-        double GetStdDev() const;
-
-        bool operator<(const Result& rhs) const;
-
-        bool HadAnyFLTMAXValues() const {
-            for (int i = 0; i < value.size(); ++i) {
-                if (value[i] >= FLT_MAX) return true;
-            }
-            return false;
-        }
-    };
-
-   protected:
-    vector<Result> results;
-
-   public:
-    void AddResult(const string& test, const string& atts, const string& unit, double value);
-    void AddResults(const string& test, const string& atts, const string& unit,
-                    const vector<double>& values);
-    vector<Result> GetResultsForTest(const string& test);
-    const vector<Result>& GetResults() const;
-    void ClearAllResults();
-    void DumpDetailed(ostream&);
-    void DumpSummary(ostream&);
-    void DumpCsv(string fileName);
-
-   private:
-    bool IsFileEmpty(string fileName);
-};
-
-
-#endif
diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp
index 9d384c7d6a..625d8cd742 100644
--- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp
+++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp
@@ -1,16 +1,13 @@
 /*
 Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
-
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
-
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
-
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -21,142 +18,134 @@ THE SOFTWARE.
 */
 
 #include "hip/hip_runtime.h"
+#ifdef __HIP_PLATFORM_HCC__
+#include "hip/hip_ext.h"
+#endif
 #include <iostream>
-#include <time.h>
-#include "ResultDatabase.h"
-
-#define PRINT_PROGRESS 0
-
-#define check(cmd)                                                                                 \
-    {                                                                                              \
-        hipError_t status = cmd;                                                                   \
-        if (status != hipSuccess) {                                                                \
-            printf("error: '%s'(%d) from %s at %s:%d\n", hipGetErrorString(status), status, #cmd,  \
-                   __FILE__, __LINE__);                                                            \
-            abort();                                                                               \
-        }                                                                                          \
-    }
-
-#define LEN 1024 * 1024
+#include <chrono>
+#include <algorithm>
 
 #define NUM_GROUPS 1
-#define GROUP_SIZE 64
-#define TEST_ITERS 20
-#define DISPATCHES_PER_TEST 100
+#define GROUP_SIZE 1
+#define WARMUP_RUN_COUNT 10
+#define TIMING_RUN_COUNT 100
+#define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT
+#define BATCH_SIZE 1000
 
-const unsigned p_tests = 0xfffffff;
+#define FILE_NAME "test_kernel.code"
+#define KERNEL_NAME "test"
 
+__global__ void EmptyKernel() { }
 
-// HCC optimizes away fully NULL kernel calls, so run one that is nearly null:
-__global__ void NearlyNull(float* Ad) {
-    if (Ad) {
-        Ad[0] = 42;
-    }
+void print_timing(std::string test, const std::array<float, TOTAL_RUN_COUNT> &results, int batch = 1) {
+    
+    float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f;
+    
+    // skip warm-up runs
+    auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT);
+    auto end_iter = results.end();
+
+    // mean
+    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
+        total_us += (run_ms * 1000) / batch;
+    });   
+    mean_us = total_us  / TIMING_RUN_COUNT;
+
+   // stddev
+    total_us = 0;
+    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
+        float dev_us = ((run_ms * 1000) / batch) - mean_us;
+        total_us += dev_us * dev_us;
+    });
+    stddev_us = sqrt(total_us / TIMING_RUN_COUNT);
+
+    // display
+    printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us);
 }
 
-
-ResultDatabase resultDB;
-
-
-void stopTest(hipEvent_t start, hipEvent_t stop, const char* msg, int iters) {
-    float mS = 0;
-    check(hipEventRecord(stop));
-    check(hipDeviceSynchronize());
-    check(hipEventElapsedTime(&mS, start, stop));
-    resultDB.AddResult(std::string(msg), "", "uS", mS * 1000 / iters);
-    if (PRINT_PROGRESS & 0x1) {
-        std::cout << msg << "\t\t" << mS * 1000 / iters << " uS" << std::endl;
-    }
-    if (PRINT_PROGRESS & 0x2) {
-        resultDB.DumpSummary(std::cout);
-    }
-}
-
-
-int main() {
-    hipError_t err;
-    float* Ad;
-    check(hipMalloc(&Ad, 4));
-
-
-    hipStream_t stream;
-    check(hipStreamCreate(&stream));
-
-
-    hipEvent_t start, sync, stop;
-    check(hipEventCreate(&start));
-    check(hipEventCreateWithFlags(&sync, hipEventBlockingSync));
-    check(hipEventCreate(&stop));
-
-
+int main() {   
     hipStream_t stream0 = 0;
+    hipDevice_t device;
+    hipDeviceGet(&device, 0);
+    hipCtx_t context;     
+    hipCtxCreate(&context, 0, device); 
+    hipModule_t module;
+    hipFunction_t function;
+    hipModuleLoad(&module, FILE_NAME);
+    hipModuleGetFunction(&function, module, KERNEL_NAME);
+    void* params = nullptr;
+    
+    std::array<float, TOTAL_RUN_COUNT> results;
+    hipEvent_t start, stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
 
+    /************************************************************************************/
+    /* HIP kernel launch enqueue rate:                                                  */
+    /* Measure time taken to enqueue a kernel on the GPU                                */
+    /************************************************************************************/ 
 
-    if (p_tests & 0x1) {
-        hipEventRecord(start);
-        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
-        stopTest(start, stop, "FirstKernelLaunch", 1);
+    // Timing hipModuleLaunchKernel
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+        auto start = std::chrono::high_resolution_clock::now();
+        hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, &params, nullptr);
+        auto stop = std::chrono::high_resolution_clock::now();
+        results[i] = std::chrono::duration<float, std::milli>(stop - start).count();
     }
+    print_timing("hipModuleLaunchKernel enqueue rate", results);
 
-
-    if (p_tests & 0x2) {
-        hipEventRecord(start);
-        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
-        stopTest(start, stop, "SecondKernelLaunch", 1);
+    // Timing hipLaunchKernelGGL
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+        auto start = std::chrono::high_resolution_clock::now();
+        hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
+        auto stop = std::chrono::high_resolution_clock::now();
+        results[i] = std::chrono::duration<float, std::milli>(stop - start).count();
     }
+    print_timing("hipLaunchKernelGGL enqueue rate", results);
 
+    /***********************************************************************************/
+    /* Single dispatch execution latency using HIP events:                             */   
+    /* Measures latency to start & finish executing a kernel with GPU-scope visibility    */ 
+    /***********************************************************************************/
 
-    if (p_tests & 0x4) {
-        for (int t = 0; t < TEST_ITERS; t++) {
-            hipEventRecord(start);
-            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
-                hipEventRecord(sync);
-                hipEventSynchronize(sync);
-            }
-            stopTest(start, stop, "NullStreamASyncDispatchWait", DISPATCHES_PER_TEST);
-        }
-    }
-
-
-    if (p_tests & 0x10) {
-        for (int t = 0; t < TEST_ITERS; t++) {
-            hipEventRecord(start);
-            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
-                hipEventRecord(sync);
-                hipEventSynchronize(sync);
-            }
-            stopTest(start, stop, "StreamASyncDispatchWait", DISPATCHES_PER_TEST);
-        }
-    }
-
-#if 1
-
-    if (p_tests & 0x40) {
-        for (int t = 0; t < TEST_ITERS; t++) {
-            hipEventRecord(start);
-            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
-            }
-            stopTest(start, stop, "NullStreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
-        }
-    }
-
-    if (p_tests & 0x80) {
-        for (int t = 0; t < TEST_ITERS; t++) {
-            hipEventRecord(start);
-            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
-            }
-            stopTest(start, stop, "StreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
-        }
+    //Timing directly the dispatch
+#ifdef __HIP_PLATFORM_HCC__
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+        hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0);
+        hipEventSynchronize(stop);
+        hipEventElapsedTime(&results[i], start, stop);
     }
+    print_timing("Timing directly single dispatch latency", results);
 #endif
-    resultDB.DumpSummary(std::cout);
 
+    //Timing around the dispatch
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+        hipEventRecord(start, 0);
+        hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
+        hipEventRecord(stop, 0);
+        hipEventSynchronize(stop);
+        hipEventElapsedTime(&results[i], start, stop);
+    }
+    print_timing("Timing around single dispatch latency", results);
 
-    check(hipEventDestroy(start));
-    check(hipEventDestroy(sync));
-    check(hipEventDestroy(stop));
+    /*********************************************************************************/
+    /* Batch dispatch execution latency using HIP events:                            */
+    /* Measures latency to start & finish executing each dispatch in a batch    */ 
+    /*********************************************************************************/
+
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+         hipEventRecord(start, 0);
+         for (int j = 0; j < BATCH_SIZE; j++) {
+             hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
+         }
+         hipEventRecord(stop, 0);
+         hipEventSynchronize(stop);
+         hipEventElapsedTime(&results[i], start, stop);
+    }
+    print_timing("Batch dispatch latency", results, BATCH_SIZE);
+
+    hipEventDestroy(start);
+    hipEventDestroy(stop);
+    hipCtxDestroy(context);
 }
+
diff --git a/samples/1_Utils/hipDispatchLatency/test_kernel.cpp b/samples/1_Utils/hipDispatchLatency/test_kernel.cpp
new file mode 100644
index 0000000000..23ef426730
--- /dev/null
+++ b/samples/1_Utils/hipDispatchLatency/test_kernel.cpp
@@ -0,0 +1,24 @@
+/*
+Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "hip/hip_runtime.h"
+
+extern "C" __global__ void test() {
+}
+

From 145334eda284e02bad61f207fd66e8cf173697ec Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <evgeny.mankov@gmail.com>
Date: Thu, 23 Jan 2020 16:37:25 +0300
Subject: [PATCH 03/23] [HIPIFY][cmake] Send error on CUDA 10.2 and higher as
 yet unsupported

---
 hipify-clang/CMakeLists.txt | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/hipify-clang/CMakeLists.txt b/hipify-clang/CMakeLists.txt
index 3394b98dce..470d42c6a5 100644
--- a/hipify-clang/CMakeLists.txt
+++ b/hipify-clang/CMakeLists.txt
@@ -155,30 +155,31 @@ if (HIPIFY_CLANG_TESTS)
     require_program(lit)
     require_program(FileCheck)
 
-    # Populates CUDA_TOOLKIT_ROOT_DIR, which is then applied to the lit config to give the
-    # value of --cuda-path for the test runs.
     find_package(CUDA REQUIRED)
     if ((CUDA_VERSION VERSION_LESS "7.0") OR (LLVM_PACKAGE_VERSION VERSION_LESS "3.8") OR
         (CUDA_VERSION VERSION_GREATER "7.5" AND LLVM_PACKAGE_VERSION VERSION_LESS "4.0") OR
         (CUDA_VERSION VERSION_GREATER "8.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "6.0") OR
         (CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0") OR
         (CUDA_VERSION VERSION_GREATER "9.2" AND LLVM_PACKAGE_VERSION VERSION_LESS "8.0") OR
-        (CUDA_VERSION VERSION_GREATER "10.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "9.0"))
-        message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by clang ${LLVM_PACKAGE_VERSION}.")
+        (CUDA_VERSION VERSION_GREATER "10.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "9.0") OR
+        (CUDA_VERSION VERSION_GREATER "10.1"))
+        message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by LLVM ${LLVM_PACKAGE_VERSION}.")
         if (CUDA_VERSION_MAJOR VERSION_LESS "7")
             message(STATUS "Please install CUDA 7.0 or higher.")
         elseif (CUDA_VERSION_MAJOR VERSION_LESS "8")
-            message(STATUS "Please install clang 3.8 or higher.")
+            message(STATUS "Please install LLVM + clang 3.8 or higher.")
         elseif (CUDA_VERSION_MAJOR VERSION_LESS "9")
-            message(STATUS "Please install clang 4.0 or higher.")
+            message(STATUS "Please install LLVM + clang 4.0 or higher.")
         elseif (CUDA_VERSION VERSION_EQUAL "9.0")
-            message(STATUS "Please install clang 6.0 or higher.")
+            message(STATUS "Please install LLVM + clang 6.0 or higher.")
         elseif (CUDA_VERSION_MAJOR VERSION_LESS "10")
-            message(STATUS "Please install clang 7.0 or higher.")
+            message(STATUS "Please install LLVM + clang 7.0 or higher.")
         elseif (CUDA_VERSION VERSION_EQUAL "10.0")
-            message(STATUS "Please install clang 8.0 or higher.")
+            message(STATUS "Please install LLVM + clang 8.0 or higher.")
         elseif (CUDA_VERSION VERSION_EQUAL "10.1")
-            message(STATUS "Please install clang 9.0 or higher.")
+            message(STATUS "Please install LLVM + clang 9.0 or higher.")
+        elseif (CUDA_VERSION VERSION_GREATER "10.1")
+            message(STATUS "Please install CUDA 10.1 or lesser.")
         endif()
     endif()
 

From e90e6e59c32635e5d589b94d6347e10b0632aa9e Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <evgeny.mankov@gmail.com>
Date: Thu, 23 Jan 2020 17:47:11 +0300
Subject: [PATCH 04/23] [HIPIFY][cmake] Formatting

---
 hipify-clang/CMakeLists.txt | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/hipify-clang/CMakeLists.txt b/hipify-clang/CMakeLists.txt
index 470d42c6a5..fcc48c2088 100644
--- a/hipify-clang/CMakeLists.txt
+++ b/hipify-clang/CMakeLists.txt
@@ -115,10 +115,8 @@ install(
   PATTERN "complex"
   PATTERN "new"
   PATTERN "ppc_wrappers" EXCLUDE
-  PATTERN "openmp_wrappers" EXCLUDE
-  )
+  PATTERN "openmp_wrappers" EXCLUDE)
 
-# Package: hipify-clang
 if (UNIX)
     set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/packages/hipify-clang)
     configure_file(packaging/hipify-clang.txt ${BUILD_DIR}/CMakeLists.txt @ONLY)
@@ -132,8 +130,8 @@ if (UNIX)
 
     file(GENERATE OUTPUT ${PROJECT_BINARY_DIR}/fixnames
         CONTENT "pwd; for i in *.deb; do mv \"\$i\" \"\${i/.deb/-amd64.deb}\" ; done
-        for i in *.rpm ; do mv \$i \${i/.rpm/.x86_64.rpm} ; done
-        ")
+        for i in *.rpm ; do mv \$i \${i/.rpm/.x86_64.rpm} ; done")
+
     add_custom_target(package_hipify-clang
         COMMAND bash ${PROJECT_BINARY_DIR}/fixnames
         WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
@@ -186,15 +184,13 @@ if (HIPIFY_CLANG_TESTS)
     configure_file(
         ${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang/lit.site.cfg.in
         ${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
-        @ONLY
-    )
+        @ONLY)
 
     add_lit_testsuite(test-hipify "Running HIPify regression tests"
         ${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang
         PARAMS site_config=${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
         ARGS -v
-        DEPENDS hipify-clang
-    )
+        DEPENDS hipify-clang)
 
     add_custom_target(test-hipify-clang)
     add_dependencies(test-hipify-clang test-hipify)

From 78255e4d4f3f066ec8e5139c30f704f595d6fbec Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <evgeny.mankov@gmail.com>
Date: Thu, 23 Jan 2020 18:01:48 +0300
Subject: [PATCH 05/23] [HIPIFY][doc] Readme - update versions

---
 hipify-clang/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hipify-clang/README.md b/hipify-clang/README.md
index f0f90bd2a1..c9c138e900 100644
--- a/hipify-clang/README.md
+++ b/hipify-clang/README.md
@@ -357,7 +357,7 @@ LLVM 7.0.0 - 9.0.1, CUDA 7.5 - 10.1 Update 2, cudnn 7.0.5.15 - 7.6.5.32
 
 Build system requirements for the latest configuration LLVM 9.0.1/CUDA 10.1 Update 2:
 
-Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.2, Visual Studio 2017 (15.5.2) - 2019 (16.4.2).
+Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.3, Visual Studio 2017 (15.5.2) - 2019 (16.4.3).
 
 Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`:
 
@@ -438,5 +438,5 @@ The information contained herein is for informational purposes only, and is subj
 
 AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
 
-Copyright (c) 2014-2019 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2014-2020 Advanced Micro Devices, Inc. All rights reserved.
 

From 604c09f428576675e3a0ef416af9db6d978f54c3 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <evgeny.mankov@gmail.com>
Date: Thu, 23 Jan 2020 20:55:10 +0300
Subject: [PATCH 06/23] [HIP][HIPIFY] Readmes actualization

---
 README.md              |  31 +++++-----
 hipify-clang/README.md | 135 +++++++++++++++++++++++++++--------------
 2 files changed, 104 insertions(+), 62 deletions(-)

diff --git a/README.md b/README.md
index 145f950fef..2bffd12162 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ Key features include:
 * HIP is very thin and has little or no performance impact over coding directly in CUDA or hcc "HC" mode.
 * HIP allows coding in a single-source C++ programming language including features such as templates, C++11 lambdas, classes, namespaces, and more.
 * HIP allows developers to use the "best" development environment and tools on each target platform.
-* The "hipify" tool automatically converts source from CUDA to HIP.
+* The [HIPIFY](hipify-clang/README.md) tools automatically convert source from CUDA to HIP.
 * Developers can specialize for the platform (CUDA or hcc) to tune for performance or handle tricky cases 
 
 New projects can be developed directly in the portable HIP C++ language and can run on either NVIDIA or AMD platforms.  Additionally, HIP provides porting tools which make it easy to port existing CUDA codes to the HIP layer, with no loss of performance as compared to the original CUDA application.  HIP is not intended to be a drop-in replacement for CUDA, and developers should expect to do some manual coding and performance tuning work to complete the port.
@@ -36,7 +36,7 @@ HIP releases are typically of two types. The tag naming convention is different
 - [HIP Profiling ](docs/markdown/hip_profiling.md)
 - [HIP Debugging](docs/markdown/hip_debugging.md)
 - [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL)
-- [hipify-clang](hipify-clang/README.md)
+- [HIPIFY](hipify-clang/README.md)
 - Supported CUDA APIs:
   * [Runtime API](docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md)
   * [Driver API](docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md)
@@ -71,7 +71,7 @@ hipLaunchKernel(vector_square,   /* compute kernel*/
                 dim3(blocks), dim3(threadsPerBlock), 0/*dynamic shared*/, 0/*stream*/,     /* launch config*/
                 C_d, A_d, N);  /* arguments to the compute kernel */
 
-hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); 
+hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost);
 ```
 
 
@@ -88,7 +88,7 @@ __global__ void
 vector_square(T *C_d, const T *A_d, size_t N)
 {
     size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t stride = hipBlockDim_x * hipGridDim_x;
 
     for (size_t i=offset; i<N; i+=stride) {
         C_d[i] = A_d[i] * A_d[i];
@@ -100,12 +100,12 @@ The HIP Runtime API code and compute kernel definition can exist in the same sou
 
 ## HIP Portability and Compiler Technology
 HIP C++ code can be compiled with either :
-- On the NVIDIA CUDA platform, HIP provides header file which translate from the HIP runtime APIs to CUDA runtime APIs.  The header file contains mostly inlined 
+- On the NVIDIA CUDA platform, HIP provides header file which translate from the HIP runtime APIs to CUDA runtime APIs.  The header file contains mostly inlined
   functions and thus has very low overhead - developers coding in HIP should expect the same performance as coding in native CUDA.  The code is then
   compiled with nvcc, the standard C++ compiler provided with the CUDA SDK.  Developers can use any tools supported by the CUDA SDK including the CUDA
   profiler and debugger.
-- On the AMD ROCm platform, HIP provides a header and runtime library built on top of hcc compiler.  The HIP runtime implements HIP streams, events, and memory APIs, 
-  and is a object library that is linked with the application.  The source code for all headers and the library implementation is available on GitHub.  
+- On the AMD ROCm platform, HIP provides a header and runtime library built on top of hcc compiler.  The HIP runtime implements HIP streams, events, and memory APIs,
+  and is a object library that is linked with the application.  The source code for all headers and the library implementation is available on GitHub.
   HIP developers on ROCm can use AMD's CodeXL for debugging and profiling.
 
 Thus HIP source code can be compiled to run on either platform.  Platform-specific features can be isolated to a specific platform using conditional compilation.  Thus HIP
@@ -114,7 +114,7 @@ provides source portability to either platform.   HIP provides the _hipcc_ compi
 
 ## Examples and Getting Started:
 
-* A sample and [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that uses hipify to convert a simple app from CUDA to HIP:
+* A sample and [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that uses any of [HIPIFY](hipify-clang/README.md) tools to convert a simple app from CUDA to HIP:
 
  
 ```shell
@@ -136,19 +136,20 @@ The GitHub repository [HIP-Examples](https://github.com/ROCm-Developer-Tools/HIP
 The README with the procedures and tips the team used during this porting effort is here: [Rodinia Porting Guide](https://github.com/ROCm-Developer-Tools/HIP-Examples/blob/master/rodinia_3.0/hip/README.hip_porting)
 
 ## Tour of the HIP Directories
-* **include**: 
+* **include**:
     * **hip_runtime_api.h** : Defines HIP runtime APIs and can be compiled with many standard Linux compilers (hcc, GCC, ICC, CLANG, etc), in either C or C++ mode.
     * **hip_runtime.h** : Includes everything in hip_runtime_api.h PLUS hipLaunchKernel and syntax for writing device kernels and device functions.  hip_runtime.h can only be compiled with hcc.
-    * **hcc_detail/**** , **nvcc_detail/**** : Implementation details for specific platforms.  HIP applications should not include these files directly.
+    * **hcc_detail/**** , **nvcc_detail/**** : Implementation details for specific platforms. HIP applications should not include these files directly.
     * **hcc.h** : Includes interop APIs for HIP and HCC
     
 * **bin**: Tools and scripts to help with hip porting
-    * **hipify** : Tool to convert CUDA code to portable CPP.  Converts CUDA APIs and kernel builtins.  
-    * **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc will call nvcc or hcc depending on platform, and include appropriate platform-specific headers and libraries.
-    * **hipconfig** : Print HIP configuration (HIP_PATH, HIP_PLATFORM, CXX config flags, etc)
-    * **hipexamine.sh** : Script to scan directory, find all code, and report statistics on how much can be ported with HIP (and identify likely features not yet supported)
+    * **hipify-perl** : Script based tool to convert CUDA code to portable CPP. Converts CUDA APIs and kernel builtins.
+    * **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc will call nvcc or hcc depending on platform and include appropriate platform-specific headers and libraries.
+    * **hipconfig** : Print HIP configuration (HIP_PATH, HIP_PLATFORM, CXX config flags, etc.)
+    * **hipexamine-perl.sh** : Script to scan the directory, find all code, and report statistics on how much can be ported with HIP (and identify likely features not yet supported).
+    * **hipconvertinplace-perl.sh** : Script to scan the directory, find all code, and convert the found CUDA code to HIP reporting all unconverted things.
 
-* **doc**: Documentation - markdown and doxygen info
+* **doc**: Documentation - markdown and doxygen info.
 
 ## Reporting an issue
 Use the [GitHub issue tracker](https://github.com/ROCm-Developer-Tools/HIP/issues).
diff --git a/hipify-clang/README.md b/hipify-clang/README.md
index c9c138e900..6607be198c 100644
--- a/hipify-clang/README.md
+++ b/hipify-clang/README.md
@@ -1,11 +1,13 @@
-# hipify-clang
-
-`hipify-clang` is a clang-based tool to translate CUDA source code into portable HIP C++ automatically.
-
+# HIPIFY
+### Tools to translate CUDA source code into portable HIP C++ automatically
 ## Table of Contents
 
 <!-- toc -->
 
+- [hipify-clang](#clang)
+     * [Usage](#hipify-clang-usage)
+- [hipify-perl](#perl)
+     * [Usage](#hipify-perl-usage)
 - [Supported CUDA APIs](#cuda-apis)
 - [Dependencies](#dependencies)
 - [Build and install](#build-and-install)
@@ -13,12 +15,92 @@
      * [Testing](#testing)
      * [Linux](#linux)
      * [Windows](#windows)
-- [Running and using hipify-clang](#running-and-using-hipify-clang)
-     * [hipify-perl](#perl)
 - [Disclaimer](#disclaimer)
 
 <!-- tocstop -->
 
+## <a name="clang"></a> hipify-clang
+
+`hipify-clang` is a clang-based tool for translation CUDA sources into HIP sources.
+It translates CUDA source into an abstract syntax tree, which is being traversed by transformation matchers.
+After applying all the matchers, the output HIP source is produced.
+
+**Advantages:**
+
+1. It is a translator; thus, any even very complicated constructs will be parsed successfully, or an error will be reported.
+2. It supports clang options like -I, -D, --cuda-path, etc.
+3. Seamless support of new CUDA versions as it is clang's responsibility.
+4. Ease in support.
+
+**Disadvantages:**
+
+1. The main advantage is also the main disadvantage: the input CUDA code should be correct; incorrect code wouldn't be translated to HIP.
+2. CUDA should be installed and provided in case of multiple installations by --cuda-path option.
+3. All the includes and defines should be provided to transform code successfully.
+
+### <a name="hipify-clang-usage"></a> hipify-clang usage
+
+To process a file, `hipify-clang` needs access to the same headers that would be required to compile it with clang.
+
+For example:
+
+```shell
+./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
+```
+
+`hipify-clang` arguments are given first, followed by a separator '--', and then the arguments you'd pass to `clang` if you
+were compiling the input file. For example:
+
+```shell
+./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.1 -- -std=c++17
+```
+
+The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code) may be useful.
+
+For a list of `hipify-clang` options, run `hipify-clang --help`.
+
+## <a name="perl"></a> hipify-perl
+
+`hipify-perl` is autogenerated perl-based script which heavily uses regular expressions.
+
+**Advantages:**
+
+1. Ease in use.
+
+2. It doesn't check the input source CUDA code for correctness.
+
+3. It doesn't have dependencies on 3rd party tools, including CUDA.
+
+**Disadvantages:**
+
+1. Current disability (and difficulty in implementing) of transforming the following constructs:
+
+    * macros expansion;
+
+    * namespaces:
+
+        - redefines of CUDA entities in user namespaces;
+
+        - using directive;
+
+    * templates (some cases);
+
+    * device/host function calls distinguishing;
+
+    * header files correct injection;
+
+    * complicated argument lists parsing.
+
+2. Difficulties in supporting.
+
+### <a name="hipify-perl-usage"></a> hipify-perl usage
+
+To generate `hipify-perl`, run `hipify-clang --perl`.
+
+```shell
+perl hipify-perl square.cu > square.cu.hip
+```
+
 ## <a name="cuda-apis"></a> Supported CUDA APIs
 
 - [Runtime API](../docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md)
@@ -391,47 +473,6 @@ cmake
 -- Generating done
 -- Build files have been written to: f:/HIP/hipify-clang/build
 ```
-
-## <a name="running-and-using-hipify-clang"></a> Running and using hipify-clang
-
-To process a file, `hipify-clang` needs access to the same headers that would be needed to compile it with clang.
-
-For example:
-
-```shell
-./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
-```
-
-`hipify-clang` arguments are given first, followed by a separator, and then the arguments you'd pass to `clang` if you
-were compiling the input file. The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code)
-may be useful.
-
-For a list of `hipify-clang` options, run `hipify-clang --help`.
-
-### <a name="perl"></a> hipify-perl
-
-To produce a Perl-based script `hipify-perl`, run `hipify-clang --perl`.
-
-The `hipify-perl` script, unlike the `hipify-clang`, being based on regular expressions, and not on an abstract syntax tree, has several gaps:
-
-1. macros expansion;
-
-2. namespaces:
-
-    - redefines of CUDA entities in user namespaces;
-
-    - using directive;
-
-3. templates (some cases);
-
-4. device/host function calls distinguishing;
-
-5. header files correct injection;
-
-6. complicated argument lists parsing.
-
-Nonetheless, `hipify-perl` is easy in use and doesn't check the input source CUDA code for correctness.
-
 ## <a name="disclaimer"></a> Disclaimer
 
 The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document. Terms and limitations applicable to the purchase or use of AMD's products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and Conditions of Sale.

From 8fc262ef2379293671f471b1d413f6b3b862579f Mon Sep 17 00:00:00 2001
From: Alexey Chernov <4ernov@gmail.com>
Date: Fri, 24 Jan 2020 13:51:49 +0300
Subject: [PATCH 07/23] Clear `HIP_PATH` before version detection (#1786)

Don't allow `HIP_PATH` to be propagated to `hipconfig`,
when run by CMake to detect the package version, as it
leads to the wrong version is detected: when there's
already HIP of some different version installed in the
system and `HIP_PATH` points to its location, `hipconfig`
tends to return the version of the installed HIP, rather
than the value defined for the distribution. The compiled
results report wrong version and spoils the rest of the
stack in this case.
---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9a1d940421..3eedf35b09 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,7 @@ endmacro()
 # Setup version information
 #############################
 # Determine HIP_BASE_VERSION
+set(ENV{HIP_PATH} "")
 execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/hipconfig --version
     OUTPUT_VARIABLE HIP_BASE_VERSION
     OUTPUT_STRIP_TRAILING_WHITESPACE)

From 6613a37b3bbf8ef351bbb7585f1121014ee3d530 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Fri, 24 Jan 2020 05:52:49 -0500
Subject: [PATCH 08/23] Fix associate code object symbols with host allocation
 bug (#1799)

The current implementation skips this procedure for a given device
object when a global symbol is found in the cache.  This is incorrect:

 - There could be other undefined globals that have not been previously
encountered further down the list
 - If a symbol is found in the cache, it doesn't need to be pinned again
but it still need to be defined for the current executable

Added special case for the printf buffer symbol (already pinned by HCC)

The bug was exposed by running printf on different GPUs.
---
 src/program_state.cpp                |  2 +-
 src/program_state.inl                | 60 +++++++++++++++++++---------
 tests/src/kernel/hipPrintfKernel.cpp |  9 ++++-
 3 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/src/program_state.cpp b/src/program_state.cpp
index dbd7d3ebc4..5e9f9976be 100644
--- a/src/program_state.cpp
+++ b/src/program_state.cpp
@@ -61,7 +61,7 @@ namespace hip_impl {
         if (it == impl->get_globals().end())
             return nullptr;
         else
-            return it->second;
+            return it->second.first;
     }
 
     hsa_executable_t program_state::load_executable(const char* data,
diff --git a/src/program_state.inl b/src/program_state.inl
index 993418de96..8861558e04 100644
--- a/src/program_state.inl
+++ b/src/program_state.inl
@@ -18,6 +18,7 @@
 #include <hsa/hsa_ext_amd.h>
 #include <hsa/hsa_ven_amd_loader.h>
 #include <amd_comgr.h>
+#include "hc.hpp"
 
 #include <link.h>
 
@@ -193,7 +194,8 @@ public:
     std::tuple<
         std::once_flag,
         std::mutex,
-        std::unordered_map<std::string, void*>> globals;
+        // map from string to pair<global_addr, pinned_addr>
+        std::unordered_map<std::string, std::pair<void*, void*>>> globals;
 
     using RAII_code_reader =
         std::unique_ptr<hsa_code_object_reader_t, 
@@ -308,7 +310,7 @@ public:
         return symbol_addresses.second;
     }
 
-    std::unordered_map<std::string, void*>& get_globals() {
+    std::unordered_map<std::string, std::pair<void*, void*>>& get_globals() {
         std::call_once(std::get<0>(globals), [this]() { 
             std::get<2>(globals).reserve(get_symbol_addresses().size()); 
         });
@@ -349,30 +351,52 @@ public:
         auto& g_mutex = get_globals_mutex();
         for (auto&& x : undefined_symbols) {
 
-            if (g.find(x) != g.cend()) return;
-
             const auto it1 = get_symbol_addresses().find(x);
-
             if (it1 == get_symbol_addresses().cend()) {
                 hip_throw(std::runtime_error{
                     "Global symbol: " + x + " is undefined."});
             }
 
-            std::lock_guard<std::mutex> lck{g_mutex};
+            hsa_status_t status;
+            auto check_hsa_global_var_define_error = [&x](hsa_status_t s) {
+                if (s != HSA_STATUS_SUCCESS) {
+                    const char* es;
+                    hsa_status_string(s, &es);
+                    hip_throw(std::runtime_error{ "Error when defining symbol " + x + " : " + es});
+                }
+            };
 
-            if (g.find(x) != g.cend()) return;
+            auto retrieve_pinned_address_from_cache = [](decltype(g) g, decltype(x) x) {
+                const auto& global_addr = g.find(x);
+                if (global_addr != g.cend()) {
+                    return global_addr->second.second;
+                }
+                return (void*)nullptr;
+            };
 
-            g.emplace(x, (void*)(it1->second.first));
-            void* p = nullptr;
-            hsa_amd_memory_lock(
-                reinterpret_cast<void*>(it1->second.first),
-                it1->second.second,
-                nullptr,  // All agents.
-                0,
-                &p);
-
-            hsa_executable_agent_global_variable_define(
-                executable, agent, x.c_str(), p);
+            void* p = retrieve_pinned_address_from_cache(g, x);
+            if (p == nullptr) {
+                std::lock_guard<std::mutex> lck{g_mutex};
+                p = retrieve_pinned_address_from_cache(g, x);
+                if (p == nullptr) {
+                    if (x == "_ZN2hc13printf_bufferE") {
+                        // This is the printf buffer, get the pinned address from HCC
+                        p = Kalmar::getContext()->getPrintfBufferPointerVA();
+                    } 
+                    else {
+                        status = hsa_amd_memory_lock(reinterpret_cast<void*>(it1->second.first),
+                                                     it1->second.second,
+                                                     nullptr,  // All agents.
+                                                     0, &p);
+                        check_hsa_global_var_define_error(status);
+                    }
+                    // cache the global address and its pinned address
+                    g.emplace(x, std::make_pair(reinterpret_cast<void*>(it1->second.first), p));
+                }
+            }
+            status = hsa_executable_agent_global_variable_define(
+                         executable, agent, x.c_str(), p);
+            check_hsa_global_var_define_error(status);
         }
     }
 
diff --git a/tests/src/kernel/hipPrintfKernel.cpp b/tests/src/kernel/hipPrintfKernel.cpp
index 1d4fa5fe30..5675f2e6bd 100644
--- a/tests/src/kernel/hipPrintfKernel.cpp
+++ b/tests/src/kernel/hipPrintfKernel.cpp
@@ -30,7 +30,12 @@ THE SOFTWARE.
 __global__ void run_printf() { printf("Hello World\n"); }
 
 int main() {
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
-    hipDeviceSynchronize();
+    int device_count = 0;
+    hipGetDeviceCount(&device_count);
+    for (int i = 0; i < device_count; ++i) {
+        hipSetDevice(i);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
+        hipDeviceSynchronize();
+    }
     passed();
 }

From f653c8c9994212e22fb0bbbd1108671e83321ed6 Mon Sep 17 00:00:00 2001
From: paulfreddy <52053501+paulfreddy@users.noreply.github.com>
Date: Fri, 24 Jan 2020 02:52:57 -0800
Subject: [PATCH 09/23] Fix install script syntax error (#1805)

Fix hip-nvcc install warning on postinstall and prerm script
---
 packaging/hip-nvcc.postinst | 2 +-
 packaging/hip-nvcc.prerm    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/packaging/hip-nvcc.postinst b/packaging/hip-nvcc.postinst
index 2f901324cb..b70cf2848f 100755
--- a/packaging/hip-nvcc.postinst
+++ b/packaging/hip-nvcc.postinst
@@ -3,6 +3,6 @@
 ROCMDIR=@ROCM_PATH@
 HIPDIR=$ROCMDIR/hip
 
-if [ -d $ROCMDIR]
+if [ -d $ROCMDIR] ; then
     ln -s -f $ROCMDIR /opt/rocm
 fi
diff --git a/packaging/hip-nvcc.prerm b/packaging/hip-nvcc.prerm
index 96875e4a9c..baa0e6f5c7 100755
--- a/packaging/hip-nvcc.prerm
+++ b/packaging/hip-nvcc.prerm
@@ -1,5 +1,5 @@
 #!/bin/bash
 
-if [ -L "/opt/rocm" ]
+if [ -L "/opt/rocm" ] ; then
    unlink /opt/rocm
 fi

From bed8f1c1b8a865add45035e6f4216d5cbf5b7aa1 Mon Sep 17 00:00:00 2001
From: mshivama <47909405+mshivama@users.noreply.github.com>
Date: Fri, 24 Jan 2020 16:23:28 +0530
Subject: [PATCH 10/23] SWDEV-220503:  this_grid().thread_rank() gives
 incorrect result (#1808)

* fix a minor bug while computing this.grid()::thread_rank()
---
 include/hip/hcc_detail/hip_cooperative_groups_helper.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/hip/hcc_detail/hip_cooperative_groups_helper.h b/include/hip/hcc_detail/hip_cooperative_groups_helper.h
index b74d16d23b..9738448d94 100644
--- a/include/hip/hcc_detail/hip_cooperative_groups_helper.h
+++ b/include/hip/hcc_detail/hip_cooperative_groups_helper.h
@@ -106,7 +106,7 @@ __CG_STATIC_QUALIFIER__ uint32_t size() {
 }
 
 __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  // Compute global id of the workgroup to which the current threads belongs to
+  // Compute global id of the workgroup to which the current thread belongs to
   uint32_t blkIdx =
            (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
                       (hipBlockIdx_y * hipGridDim_x) +
@@ -115,7 +115,7 @@ __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
   // Compute total number of threads being passed to reach current workgroup
   // within grid
   uint32_t num_threads_till_current_workgroup =
-           (uint32_t)(blkIdx * (hipBlockIdx_x * hipBlockIdx_y * hipBlockIdx_z));
+           (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
 
   // Compute thread local rank within current workgroup
   uint32_t local_thread_rank =

From 27275c6a2cce0ea6c7961eaa5eba4a6767f53202 Mon Sep 17 00:00:00 2001
From: aakanksha555 <41199349+aakanksha555@users.noreply.github.com>
Date: Fri, 24 Jan 2020 05:53:47 -0500
Subject: [PATCH 11/23] Fix for a syntax error in deb packages (#1814)

---
 hipify-clang/packaging/hipify-clang.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipify-clang/packaging/hipify-clang.txt b/hipify-clang/packaging/hipify-clang.txt
index 5f78e7e67e..b189eff1e6 100644
--- a/hipify-clang/packaging/hipify-clang.txt
+++ b/hipify-clang/packaging/hipify-clang.txt
@@ -48,7 +48,7 @@ endif()
 set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "cuda >= 7.0")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "cuda (>= 7.0)")
 set(CPACK_BINARY_RPM "ON")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV "NO")

From 9cfada0f9d5a842889a14584cc3b63000fbc6ecd Mon Sep 17 00:00:00 2001
From: vsytch <vsytch@hotmail.com>
Date: Fri, 24 Jan 2020 14:41:15 -0500
Subject: [PATCH 12/23] Update the HIP_TRSF_* flags to match their Cuda
 equivalents. (#1801)

---
 include/hip/hcc_detail/driver_types.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/hip/hcc_detail/driver_types.h b/include/hip/hcc_detail/driver_types.h
index 510d3d058e..0c29542c7e 100644
--- a/include/hip/hcc_detail/driver_types.h
+++ b/include/hip/hcc_detail/driver_types.h
@@ -43,9 +43,10 @@ typedef struct hipChannelFormatDesc {
     enum hipChannelFormatKind f;
 }hipChannelFormatDesc;
 
-#define HIP_TRSF_NORMALIZED_COORDINATES 0x01
-#define HIP_TRSF_READ_AS_INTEGER 0x00
 #define HIP_TRSA_OVERRIDE_FORMAT 0x01
+#define HIP_TRSF_READ_AS_INTEGER 0x01
+#define HIP_TRSF_NORMALIZED_COORDINATES 0x02
+#define HIP_TRSF_SRGB 0x10
 
 typedef enum hipArray_Format {
     HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,

From e3026e99bb783b39e3c83f6b020bf3464d78767b Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <evgeny.mankov@gmail.com>
Date: Mon, 27 Jan 2020 12:37:26 +0300
Subject: [PATCH 13/23] [HIPIFY][#1819] Fix for not found clang's system header
 files

Use absolute paths instead of relative ones.
---
 hipify-clang/src/main.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/hipify-clang/src/main.cpp b/hipify-clang/src/main.cpp
index c75c24b066..844f6fa8f0 100644
--- a/hipify-clang/src/main.cpp
+++ b/hipify-clang/src/main.cpp
@@ -108,7 +108,7 @@ void sortInputFiles(int argc, const char **argv, std::vector<std::string> &files
   files.assign(sortedFiles.begin(), sortedFiles.end());
 }
 
-void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSourceAbsPath) {
+void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSourceAbsPath, const char *hipify_exe) {
   if (!IncludeDirs.empty()) {
     for (std::string s : IncludeDirs) {
       Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(s.c_str(), ct::ArgumentInsertPosition::BEGIN));
@@ -122,9 +122,14 @@ void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSou
     }
   }
   // Includes for clang's CUDA wrappers for using by packaged hipify-clang
-  Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("./include", ct::ArgumentInsertPosition::BEGIN));
+  static int Dummy;
+  std::string hipify = llvm::sys::fs::getMainExecutable(hipify_exe, (void *)&Dummy);
+  std::string clang_inc_path = llvm::sys::path::parent_path(hipify);
+  clang_inc_path.append("/include");
+  Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(clang_inc_path.c_str(), ct::ArgumentInsertPosition::BEGIN));
   Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-isystem", ct::ArgumentInsertPosition::BEGIN));
-  Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("./include/cuda_wrappers", ct::ArgumentInsertPosition::BEGIN));
+  clang_inc_path.append("/cuda_wrappers");
+  Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(clang_inc_path.c_str(), ct::ArgumentInsertPosition::BEGIN));
   Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-isystem", ct::ArgumentInsertPosition::BEGIN));
   // Ensure at least c++11 is used.
   std::string stdCpp = "-std=c++11";
@@ -315,7 +320,7 @@ int main(int argc, const char **argv) {
     ct::RefactoringTool Tool(OptionsParser.getCompilations(), std::string(tmpFile.c_str()));
     ct::Replacements &replacementsToUse = llcompat::getReplacements(Tool, tmpFile.c_str());
     ReplacementsFrontendActionFactory<HipifyAction> actionFactory(&replacementsToUse);
-    appendArgumentsAdjusters(Tool, sSourceAbsPath);
+    appendArgumentsAdjusters(Tool, sSourceAbsPath, argv[0]);
     Statistics &currentStat = Statistics::current();
     // Hipify _all_ the things!
     if (Tool.runAndSave(&actionFactory)) {

From e561d0edfe4f3532c8674a1df78a866dfd6c989b Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <evgeny.mankov@gmail.com>
Date: Mon, 27 Jan 2020 17:21:32 +0300
Subject: [PATCH 14/23] [HIPIFY] Update Readme.md

Restructure contents in order to have separated same-level information about hipify-clang and hipify-perl.
---
 hipify-clang/README.md | 171 +++++++++++++++++++++--------------------
 1 file changed, 86 insertions(+), 85 deletions(-)

diff --git a/hipify-clang/README.md b/hipify-clang/README.md
index 6607be198c..45cd6657e1 100644
--- a/hipify-clang/README.md
+++ b/hipify-clang/README.md
@@ -5,16 +5,16 @@
 <!-- toc -->
 
 - [hipify-clang](#clang)
+     * [Dependencies](#dependencies)
      * [Usage](#hipify-clang-usage)
-- [hipify-perl](#perl)
-     * [Usage](#hipify-perl-usage)
-- [Supported CUDA APIs](#cuda-apis)
-- [Dependencies](#dependencies)
-- [Build and install](#build-and-install)
      * [Building](#building)
      * [Testing](#testing)
      * [Linux](#linux)
      * [Windows](#windows)
+- [hipify-perl](#perl)
+     * [Usage](#hipify-perl-usage)
+     * [Building](#hipify-perl-building)
+- [Supported CUDA APIs](#cuda-apis)
 - [Disclaimer](#disclaimer)
 
 <!-- tocstop -->
@@ -38,81 +38,7 @@ After applying all the matchers, the output HIP source is produced.
 2. CUDA should be installed and provided in case of multiple installations by --cuda-path option.
 3. All the includes and defines should be provided to transform code successfully.
 
-### <a name="hipify-clang-usage"></a> hipify-clang usage
-
-To process a file, `hipify-clang` needs access to the same headers that would be required to compile it with clang.
-
-For example:
-
-```shell
-./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
-```
-
-`hipify-clang` arguments are given first, followed by a separator '--', and then the arguments you'd pass to `clang` if you
-were compiling the input file. For example:
-
-```shell
-./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.1 -- -std=c++17
-```
-
-The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code) may be useful.
-
-For a list of `hipify-clang` options, run `hipify-clang --help`.
-
-## <a name="perl"></a> hipify-perl
-
-`hipify-perl` is autogenerated perl-based script which heavily uses regular expressions.
-
-**Advantages:**
-
-1. Ease in use.
-
-2. It doesn't check the input source CUDA code for correctness.
-
-3. It doesn't have dependencies on 3rd party tools, including CUDA.
-
-**Disadvantages:**
-
-1. Current disability (and difficulty in implementing) of transforming the following constructs:
-
-    * macros expansion;
-
-    * namespaces:
-
-        - redefines of CUDA entities in user namespaces;
-
-        - using directive;
-
-    * templates (some cases);
-
-    * device/host function calls distinguishing;
-
-    * header files correct injection;
-
-    * complicated argument lists parsing.
-
-2. Difficulties in supporting.
-
-### <a name="hipify-perl-usage"></a> hipify-perl usage
-
-To generate `hipify-perl`, run `hipify-clang --perl`.
-
-```shell
-perl hipify-perl square.cu > square.cu.hip
-```
-
-## <a name="cuda-apis"></a> Supported CUDA APIs
-
-- [Runtime API](../docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md)
-- [Driver API](../docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md)
-- [cuComplex API](../docs/markdown/cuComplex_API_supported_by_HIP.md)
-- [cuBLAS](../docs/markdown/CUBLAS_API_supported_by_HIP.md)
-- [cuRAND](../docs/markdown/CURAND_API_supported_by_HIP.md)
-- [cuDNN](../docs/markdown/CUDNN_API_supported_by_HIP.md)
-- [cuFFT](../docs/markdown/CUFFT_API_supported_by_HIP.md)
-- [cuSPARSE](../docs/markdown/CUSPARSE_API_supported_by_HIP.md)
-
-## <a name="dependencies"></a> Dependencies
+### <a name="dependencies"></a> hipify-clang: dependencies
 
 `hipify-clang` requires:
 
@@ -148,9 +74,28 @@ In most cases, you can get a suitable version of LLVM+CLANG with your package ma
 Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
 [CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.1\dist`
 
-## <a name="build-and-install"></a> Build and install
+### <a name="hipify-clang-usage"></a> hipify-clang: usage
 
-### <a name="building"></a> Build
+To process a file, `hipify-clang` needs access to the same headers that would be required to compile it with clang.
+
+For example:
+
+```shell
+./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
+```
+
+`hipify-clang` arguments are given first, followed by a separator '--', and then the arguments you'd pass to `clang` if you
+were compiling the input file. For example:
+
+```shell
+./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.1 -- -std=c++17
+```
+
+The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code) may be useful.
+
+For a list of `hipify-clang` options, run `hipify-clang --help`.
+
+### <a name="building"></a> hipify-clang: building
 
 Assuming this repository is at `./HIP`:
 
@@ -173,7 +118,7 @@ Debug build type `-DCMAKE_BUILD_TYPE=Debug` is also supported and tested; `LLVM+
 
 The binary can then be found at `./dist/bin/hipify-clang`.
 
-### <a name="testing"></a> Testing
+### <a name="testing"></a> hipify-clang: testing
 
 `hipify-clang` has unit tests using LLVM [`lit`](https://llvm.org/docs/CommandGuide/lit.html)/[`FileCheck`](https://llvm.org/docs/CommandGuide/FileCheck.html).
 
@@ -277,7 +222,7 @@ To run it:
 
      - ***Windows***: run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`.
 
-### <a name="linux"></a >Linux
+### <a name="linux"></a > hipify-clang: Linux
 
 On Linux the following configurations are tested:
 
@@ -427,7 +372,7 @@ Testing Time: 3.07s
   Expected Passes    : 67
 [100%] Built target test-hipify
 ```
-### <a name="windows"></a >Windows
+### <a name="windows"></a > hipify-clang: Windows
 
 On Windows 10 the following configurations are tested:
 
@@ -473,6 +418,62 @@ cmake
 -- Generating done
 -- Build files have been written to: f:/HIP/hipify-clang/build
 ```
+
+## <a name="perl"></a> hipify-perl
+
+`hipify-perl` is autogenerated perl-based script which heavily uses regular expressions.
+
+**Advantages:**
+
+1. Ease in use.
+
+2. It doesn't check the input source CUDA code for correctness.
+
+3. It doesn't have dependencies on 3rd party tools, including CUDA.
+
+**Disadvantages:**
+
+1. Current disability (and difficulty in implementing) of transforming the following constructs:
+
+    * macros expansion;
+
+    * namespaces:
+
+        - redefines of CUDA entities in user namespaces;
+
+        - using directive;
+
+    * templates (some cases);
+
+    * device/host function calls distinguishing;
+
+    * header files correct injection;
+
+    * complicated argument lists parsing.
+
+2. Difficulties in supporting.
+
+### <a name="hipify-perl-usage"></a> hipify-perl: usage
+
+```shell
+perl hipify-perl square.cu > square.cu.hip
+```
+
+### <a name="hipify-perl-building"></a> hipify-perl: building
+
+To generate `hipify-perl`, run `hipify-clang --perl`. Output directory for the generated `hipify-perl` file might be specified by `--o-hipify-perl-dir` option.
+
+## <a name="cuda-apis"></a> Supported CUDA APIs
+
+- [Runtime API](../docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md)
+- [Driver API](../docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md)
+- [cuComplex API](../docs/markdown/cuComplex_API_supported_by_HIP.md)
+- [cuBLAS](../docs/markdown/CUBLAS_API_supported_by_HIP.md)
+- [cuRAND](../docs/markdown/CURAND_API_supported_by_HIP.md)
+- [cuDNN](../docs/markdown/CUDNN_API_supported_by_HIP.md)
+- [cuFFT](../docs/markdown/CUFFT_API_supported_by_HIP.md)
+- [cuSPARSE](../docs/markdown/CUSPARSE_API_supported_by_HIP.md)
+
 ## <a name="disclaimer"></a> Disclaimer
 
 The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document. Terms and limitations applicable to the purchase or use of AMD's products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and Conditions of Sale.

From f72a669487dd352e45321c4b3038f8fe2365c236 Mon Sep 17 00:00:00 2001
From: vsytch <vsytch@hotmail.com>
Date: Mon, 27 Jan 2020 19:37:00 -0500
Subject: [PATCH 15/23] Add missing texturePitchAlignment member to the
 hipDeviceProp_t struct. (#1802)

* Add missing texturePitchAlignment member to the hipDeviceProp_t struct.

* Add missing hipDeviceAttributeTexturePitchAlignment enumerator to the hipDeviceAttribute_t enum.

* Initialize texturePitchAlignment to 256. This works for gfx9+, but is technically overaligned in most cases for pre-gfx9.

* Add the texturePitchAlignment property to the NVCC path.
---
 include/hip/hip_runtime_api.h             | 2 ++
 include/hip/nvcc_detail/hip_runtime_api.h | 4 ++++
 src/hip_device.cpp                        | 5 ++++-
 src/hip_hcc.cpp                           | 1 +
 4 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
index e725846cbd..64b2a85d8a 100644
--- a/include/hip/hip_runtime_api.h
+++ b/include/hip/hip_runtime_api.h
@@ -124,6 +124,7 @@ typedef struct hipDeviceProp_t {
     unsigned int* hdpRegFlushCntl;      ///< Addres of HDP_REG_COHERENCY_FLUSH_CNTL register
     size_t memPitch;                 ///<Maximum pitch in bytes allowed by memory copies
     size_t textureAlignment;         ///<Alignment requirement for textures
+    size_t texturePitchAlignment;    ///<Pitch alignment requirement for texture references bound to pitched memory
     int kernelExecTimeoutEnabled;    ///<Run time limit for kernels executed on the device
     int ECCEnabled;                  ///<Device has ECC support enabled
     int tccDriver;                   ///< 1:If device is Tesla device using TCC driver, else 0
@@ -321,6 +322,7 @@ typedef enum hipDeviceAttribute_t {
 
     hipDeviceAttributeMaxPitch,             ///< Maximum pitch in bytes allowed by memory copies
     hipDeviceAttributeTextureAlignment,     ///<Alignment requirement for textures
+    hipDeviceAttributeTexturePitchAlignment, ///<Pitch alignment requirement for 2D texture references bound to pitched memory;
     hipDeviceAttributeKernelExecTimeout,    ///<Run time limit for kernels executed on the device
     hipDeviceAttributeCanMapHostMemory,     ///<Device can map host memory into device address space
     hipDeviceAttributeEccEnabled            ///<Device has ECC support enabled
diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h
index 0e7c936fc7..05a162478a 100644
--- a/include/hip/nvcc_detail/hip_runtime_api.h
+++ b/include/hip/nvcc_detail/hip_runtime_api.h
@@ -1130,6 +1130,7 @@ inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int dev
 
     p_prop->memPitch                 = cdprop.memPitch;
     p_prop->textureAlignment         = cdprop.textureAlignment;
+    p_prop->texturePitchAlignment    = cdprop.texturePitchAlignment;
     p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled;
     p_prop->ECCEnabled               = cdprop.ECCEnabled;
     p_prop->tccDriver                = cdprop.tccDriver;
@@ -1244,6 +1245,9 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
         case hipDeviceAttributeTextureAlignment:
             cdattr = cudaDevAttrTextureAlignment;
             break;
+        case hipDeviceAttributeTexturePitchAlignment:
+            cdattr = cudaDevAttrTexturePitchAlignment;
+            break;
         case hipDeviceAttributeKernelExecTimeout:
             cdattr = cudaDevAttrKernelExecTimeout;
             break;
diff --git a/src/hip_device.cpp b/src/hip_device.cpp
index 403194483a..aa89e62271 100644
--- a/src/hip_device.cpp
+++ b/src/hip_device.cpp
@@ -312,9 +312,12 @@ hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device
             case hipDeviceAttributeMaxPitch:
                 *pi = prop->memPitch;
                 break;
-	    case hipDeviceAttributeTextureAlignment:
+            case hipDeviceAttributeTextureAlignment:
                 *pi = prop->textureAlignment;
                 break;
+            case hipDeviceAttributeTexturePitchAlignment:
+                *pi = prop->texturePitchAlignment;
+                 break;
             case hipDeviceAttributeKernelExecTimeout:
                 *pi = prop->kernelExecTimeoutEnabled;
                 break;
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index 63bc8fe14f..175d301ee1 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -925,6 +925,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) {
 
     prop->memPitch = INT_MAX; //Maximum pitch in bytes allowed by memory copies (hardcoded 128 bytes in hipMallocPitch)
     prop->textureAlignment = 0; //Alignment requirement for textures
+    prop->texturePitchAlignment = IMAGE_PITCH_ALIGNMENT; //Alignment requirment for texture pitch
     prop->kernelExecTimeoutEnabled = 0; //no run time limit for running kernels on device
 
     hsa_isa_t isa;

From fa98798b63be461240da98c0c719bf6a92fa44d0 Mon Sep 17 00:00:00 2001
From: satyanveshd <53337087+satyanveshd@users.noreply.github.com>
Date: Thu, 30 Jan 2020 02:35:53 +0530
Subject: [PATCH 16/23] Match Occupancy APIs syntax with CUDA (#1625)

* Match Occupancy APIs syntax with CUDA and fix tests using these APIs
---
 .../hip/hcc_detail/functional_grid_launch.hpp | 14 --------
 include/hip/hcc_detail/hip_runtime_api.h      | 33 +++++++++----------
 samples/2_Cookbook/13_occupancy/occupancy.cpp |  4 +--
 src/hip_module.cpp                            | 20 +++++++----
 .../module/hipLaunchCooperativeKernel.cpp     |  2 +-
 ...upancyMaxActiveBlocksPerMultiprocessor.cpp | 21 ++----------
 .../hipOccupancyMaxPotentialBlockSize.cpp     |  1 -
 7 files changed, 35 insertions(+), 60 deletions(-)
 rename tests/src/runtimeApi/{module => occupancy}/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp (80%)
 rename tests/src/runtimeApi/{module => occupancy}/hipOccupancyMaxPotentialBlockSize.cpp (99%)

diff --git a/include/hip/hcc_detail/functional_grid_launch.hpp b/include/hip/hcc_detail/functional_grid_launch.hpp
index 76a04fa355..9eb738cf04 100644
--- a/include/hip/hcc_detail/functional_grid_launch.hpp
+++ b/include/hip/hcc_detail/functional_grid_launch.hpp
@@ -154,20 +154,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
                                       dynSharedMemPerBlk, blockSizeLimit);
 }
 
-template <typename F>
-inline
-hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel,
-    uint32_t blockSize, size_t dynSharedMemPerBlk) {
-
-    using namespace hip_impl;
-
-    hip_impl::hip_init();
-    auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
-                                                   target_agent(0));
-
-    return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk);
-}
-
 template <typename... Args, typename F = void (*)(Args...)>
 inline
 void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h
index 0712db17f9..659a6c3c3a 100644
--- a/include/hip/hcc_detail/hip_runtime_api.h
+++ b/include/hip/hcc_detail/hip_runtime_api.h
@@ -2948,7 +2948,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
  * @param [in]  dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
  */
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk);
+   int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk);
 
 /**
  * @brief Returns occupancy for a device function.
@@ -2960,7 +2960,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
  * @param [in]  flags            Extra flags for occupancy calculation (currently ignored)
  */
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
+   int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
 
 #if __HIP_VDI__ && !defined(__HCC__)
 /**
@@ -3230,21 +3230,6 @@ hipError_t hipLaunchKernel(const void* function_address,
 } /* extern "c" */
 #endif
 
-#if defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
-template <typename F>
-static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessor(
-    uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk) {
-    return ::hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, (hipFunction_t)func, blockSize,
-                                                          dynSharedMemPerBlk);
-}
-template <typename F>
-static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-    uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
-    return ::hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-        numBlocks, (hipFunction_t)func, blockSize, dynSharedMemPerBlk, flags);
-}
-#endif  // defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
-
 #if USE_PROF_API
 #include <hip/hcc_detail/hip_prof_str.h>
 #endif
@@ -3385,6 +3370,20 @@ hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
     return hipSuccess;
 }
 
+template <class T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
+    int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk) {
+    return hipOccupancyMaxActiveBlocksPerMultiprocessor(
+        numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk);
+}
+
+template <class T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
+    return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+        numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk, flags);
+}
+
 #if __HIP_VDI__ && !defined(__HCC__)
 template <class T>
 inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
diff --git a/samples/2_Cookbook/13_occupancy/occupancy.cpp b/samples/2_Cookbook/13_occupancy/occupancy.cpp
index a9f4e198b0..605c7724b2 100644
--- a/samples/2_Cookbook/13_occupancy/occupancy.cpp
+++ b/samples/2_Cookbook/13_occupancy/occupancy.cpp
@@ -86,8 +86,8 @@ void launchKernel(float* C, float* A, float* B, bool manual){
      printf("kernel Execution time = %6.3fms\n", eventMs);
 
      //Calculate Occupancy
-     uint32_t numBlock = 0;
-     HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0));
+     int numBlock = 0;
+     HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply,(int)blockSize, 0));
      
      if(devProp.maxThreadsPerMultiProcessor){
 	std::cout << "Theoretical Occupancy is " << (double)numBlock* blockSize/devProp.maxThreadsPerMultiProcessor * 100 << "%" << std::endl;
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index b11197703f..a8255ea725 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -1368,7 +1368,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
 }
 
 hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-   TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
+   TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
 {
     using namespace hip_impl;
 
@@ -1408,35 +1408,41 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
         : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
 
     // Calculate blocks occupancy per CU based on SGPR usage
-    *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
+    *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts));
 
     size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
     if (total_used_lds != 0) {
       // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
       size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
-      *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
+      *numBlocks = std::min(*numBlocks, (int) lds_occupancy);
     }
 
     return hipSuccess;
 }
 
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
+   int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk)
 {
     HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
+   
+    auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
+                                                   hip_impl::target_agent(0));
 
     return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-        tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
+        tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
 }
 
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t  blockSize, size_t dynSharedMemPerBlk,
+   int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk,
    unsigned int flags)
 {
     HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
 
+    auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
+                                                   hip_impl::target_agent(0));
+
     return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-        tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
+        tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
 }
 
 hipError_t hipLaunchKernel(
diff --git a/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp
index c76685fa89..896738892d 100644
--- a/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp
+++ b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp
@@ -116,7 +116,7 @@ int main() {
 
     dimBlock.x = workgroups[i];
     // Calculate the device occupancy to know how many blocks can be run concurrently
-    hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast<uint32_t*>(&numBlocks),
+    hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
         test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long));
 
     dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32);
diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp b/tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp
similarity index 80%
rename from tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp
rename to tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp
index ebf656b72f..8e0dd033bc 100644
--- a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp
+++ b/tests/src/runtimeApi/occupancy/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp
@@ -30,10 +30,6 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include "test_common.h"
 
-#define fileName "vcpy_kernel.code"
-#define kernel_name "hello_world"
-
-
 __global__ void f1(float *a) { *a = 1.0; }
 
 template <typename T>
@@ -49,11 +45,10 @@ int main(int argc, char* argv[]) {
     hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
     assert(gridSize != 0 && blockSize != 0);
 
-    uint32_t numBlock = 0;
-    hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0);
+    int numBlock = 0;
+    hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, (int)blockSize, 0);
     assert(numBlock != 0);
 
-
     // test case for using kernel function pointer with template
     gridSize = 0;
     blockSize = 0;
@@ -61,17 +56,7 @@ int main(int argc, char* argv[]) {
     assert(gridSize != 0 && blockSize != 0);
 
     numBlock = 0;
-    hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, blockSize, 0);
-    assert(numBlock != 0);
-
-
-    // test case for using kernel with hipFunction_t type
-    numBlock = 0;
-    hipModule_t Module;
-    hipFunction_t Function;
-    HIPCHECK(hipModuleLoad(&Module, fileName));
-    HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
-    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0));
+    hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, (int)blockSize, 0);
     assert(numBlock != 0);
 
     passed();
diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp b/tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp
similarity index 99%
rename from tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp
rename to tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp
index a81862952d..d29100d9a9 100644
--- a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp
+++ b/tests/src/runtimeApi/occupancy/hipOccupancyMaxPotentialBlockSize.cpp
@@ -33,7 +33,6 @@ THE SOFTWARE.
 #define fileName "vcpy_kernel.code"
 #define kernel_name "hello_world"
 
-
 __global__ void f1(float *a) { *a = 1.0; }
 
 template <typename T>

From 3684ef813d495225b1c472960d51753f04ee57cd Mon Sep 17 00:00:00 2001
From: jujiang-del <56359287+jujiang-del@users.noreply.github.com>
Date: Thu, 30 Jan 2020 17:04:41 -0500
Subject: [PATCH 17/23] [dtest] Avoid resViewDesc for non-array resource in
 texture creation (#1823)

---
 tests/src/texture/hipTexObjPitch.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/src/texture/hipTexObjPitch.cpp b/tests/src/texture/hipTexObjPitch.cpp
index 47648d5e73..b11e7408ae 100644
--- a/tests/src/texture/hipTexObjPitch.cpp
+++ b/tests/src/texture/hipTexObjPitch.cpp
@@ -76,8 +76,7 @@ void texture2Dtest()
     texDescr.readMode = hipReadModeElementType;
 
     hipTextureObject_t texObj;
-    hipResourceViewDesc resDesc;
-    HIPCHECK( hipCreateTextureObject(&texObj, &texRes, &texDescr, &resDesc));
+    HIPCHECK( hipCreateTextureObject(&texObj, &texRes, &texDescr, NULL));
 
     HIPCHECK(hipMalloc((void**)&devPtrB, SIZE_W*sizeof(TYPE_t)*SIZE_H)) ;
 

From 58906beac9638a00cb846192530949b723a4f5c2 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <evgeny.mankov@gmail.com>
Date: Sat, 1 Feb 2020 14:41:16 +0300
Subject: [PATCH 18/23] [HIPIFY] Fix build failure against ToT trunk LLVM
 11.0.0

+ Add an explicit cast when assigning StringRef to std::string
---
 hipify-clang/src/HipifyAction.cpp | 4 ++--
 hipify-clang/src/LLVMCompat.cpp   | 2 +-
 hipify-clang/src/main.cpp         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp
index dee9a25d49..75138c47ab 100644
--- a/hipify-clang/src/HipifyAction.cpp
+++ b/hipify-clang/src/HipifyAction.cpp
@@ -213,7 +213,7 @@ StringRef readSourceText(clang::SourceManager &SM, const clang::SourceRange &exp
   */
 std::string stringifyZeroDefaultedArg(clang::SourceManager &SM, const clang::Expr *arg) {
   if (clang::isa<clang::CXXDefaultArgExpr>(arg)) return "0";
-  else return readSourceText(SM, arg->getSourceRange());
+  else return std::string(readSourceText(SM, arg->getSourceRange()));
 }
 
 } // anonymous namespace
@@ -427,7 +427,7 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchRes
         clang::LangOptions LO;
         LO.CUDA = true;
         clang::PrintingPolicy policy(LO);
-        typeName = BT->getName(policy);
+        typeName = std::string(BT->getName(policy));
       }
     } else {
       typeName = QT.getAsString();
diff --git a/hipify-clang/src/LLVMCompat.cpp b/hipify-clang/src/LLVMCompat.cpp
index 604841bcd9..f6d74121e4 100644
--- a/hipify-clang/src/LLVMCompat.cpp
+++ b/hipify-clang/src/LLVMCompat.cpp
@@ -44,7 +44,7 @@ ct::Replacements &getReplacements(ct::RefactoringTool &Tool, StringRef file) {
 #if LLVM_VERSION_MAJOR > 3
   // getReplacements() now returns a map from filename to Replacements - so create an entry
   // for this source file and return a reference to it.
-  return Tool.getReplacements()[file];
+  return Tool.getReplacements()[std::string(file)];
 #else
   return Tool.getReplacements();
 #endif
diff --git a/hipify-clang/src/main.cpp b/hipify-clang/src/main.cpp
index 844f6fa8f0..cb411eba2f 100644
--- a/hipify-clang/src/main.cpp
+++ b/hipify-clang/src/main.cpp
@@ -124,7 +124,7 @@ void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSou
   // Includes for clang's CUDA wrappers for using by packaged hipify-clang
   static int Dummy;
   std::string hipify = llvm::sys::fs::getMainExecutable(hipify_exe, (void *)&Dummy);
-  std::string clang_inc_path = llvm::sys::path::parent_path(hipify);
+  std::string clang_inc_path = std::string(llvm::sys::path::parent_path(hipify));
   clang_inc_path.append("/include");
   Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(clang_inc_path.c_str(), ct::ArgumentInsertPosition::BEGIN));
   Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-isystem", ct::ArgumentInsertPosition::BEGIN));

From 0e7ee5b7b0fdab179b3cccd8ce40384d85a57b84 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <evgeny.mankov@gmail.com>
Date: Sat, 1 Feb 2020 16:30:05 +0300
Subject: [PATCH 19/23] [HIPIFY][cmake] CUDA 10.2 support starting from LLVM
 10.0

+ hipify-clang unit tests are passed against CUDA 10.2
---
 hipify-clang/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hipify-clang/CMakeLists.txt b/hipify-clang/CMakeLists.txt
index fcc48c2088..875b5dad74 100644
--- a/hipify-clang/CMakeLists.txt
+++ b/hipify-clang/CMakeLists.txt
@@ -160,7 +160,7 @@ if (HIPIFY_CLANG_TESTS)
         (CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0") OR
         (CUDA_VERSION VERSION_GREATER "9.2" AND LLVM_PACKAGE_VERSION VERSION_LESS "8.0") OR
         (CUDA_VERSION VERSION_GREATER "10.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "9.0") OR
-        (CUDA_VERSION VERSION_GREATER "10.1"))
+        (CUDA_VERSION VERSION_GREATER "10.1" AND LLVM_PACKAGE_VERSION VERSION_LESS "10.0"))
         message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by LLVM ${LLVM_PACKAGE_VERSION}.")
         if (CUDA_VERSION_MAJOR VERSION_LESS "7")
             message(STATUS "Please install CUDA 7.0 or higher.")
@@ -176,8 +176,8 @@ if (HIPIFY_CLANG_TESTS)
             message(STATUS "Please install LLVM + clang 8.0 or higher.")
         elseif (CUDA_VERSION VERSION_EQUAL "10.1")
             message(STATUS "Please install LLVM + clang 9.0 or higher.")
-        elseif (CUDA_VERSION VERSION_GREATER "10.1")
-            message(STATUS "Please install CUDA 10.1 or lesser.")
+        elseif (CUDA_VERSION VERSION_EQUAL "10.2")
+            message(STATUS "Please install LLVM + clang 10.0 or higher.")
         endif()
     endif()
 

From 9942c9d7cf525a1499ad537c315fc6141c765c07 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <evgeny.mankov@gmail.com>
Date: Sat, 1 Feb 2020 18:47:46 +0300
Subject: [PATCH 20/23] [HIPIFY][doc] CUDA 10.2 is now supported by LLVM
 10.0.0-rc1

---
 hipify-clang/README.md | 60 +++++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 16 deletions(-)

diff --git a/hipify-clang/README.md b/hipify-clang/README.md
index 45cd6657e1..bc29af9772 100644
--- a/hipify-clang/README.md
+++ b/hipify-clang/README.md
@@ -42,9 +42,10 @@ After applying all the matchers, the output HIP source is produced.
 
 `hipify-clang` requires:
 
-1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1).
+1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).
 
 2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base).
+To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest LLVM release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).
 
 | **LLVM release version**                                   | **CUDA latest supported version**                                        | **Windows** | **Linux** |
 |:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:|
@@ -66,6 +67,7 @@ After applying all the matchers, the output HIP source is produced.
 | [8.0.1](http://releases.llvm.org/download.html#8.0.1)      | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive)          | - <br/> not working due to <br/> the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811) <br/>+<br/>[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*</br> | + |
 | [9.0.0](http://releases.llvm.org/download.html#9.0.0)      | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base)     | +           | +         |
 | [**9.0.1**](http://releases.llvm.org/download.html#9.0.1)  | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |
+| [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1) | [10.2](https://developer.nvidia.com/cuda-downloads)             | +           | +         |
 
 `*` Download the patch and unpack it into your LLVM distributive directory; a few header files will be overwritten; rebuilding of LLVM is not needed.
 
@@ -111,7 +113,8 @@ cmake \
 
 make -j install
 ```
-On Windows, the following option should be specified for `cmake` at first place: `-G "Visual Studio 16 2019 Win64"`; the generated `hipify-clang.sln` should be built by `Visual Studio 15 2017` instead of `make.`
+On Windows, the following option should be specified for `cmake` at first place: `-G "Visual Studio 16 2019 Win64"`; the generated `hipify-clang.sln` should be built by `Visual Studio 16 2019` instead of `make.`
+Please, see [hipify-clang: Windows](#windows) for the supported tools for building.
 
 Debug build type `-DCMAKE_BUILD_TYPE=Debug` is also supported and tested; `LLVM+CLANG` should be built in `Debug` mode as well.
 64-bit build mode (`-Thost=x64` on Windows) is also supported; `LLVM+CLANG` should be built in 64-bit mode as well.
@@ -122,18 +125,14 @@ The binary can then be found at `./dist/bin/hipify-clang`.
 
 `hipify-clang` has unit tests using LLVM [`lit`](https://llvm.org/docs/CommandGuide/lit.html)/[`FileCheck`](https://llvm.org/docs/CommandGuide/FileCheck.html).
 
-**LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive for testing.**
+LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive for testing.
+
+**LLVM 9.0.1 or older:**
 
-To run it:
 1. download [`LLVM`](http://releases.llvm.org/9.0.1/llvm-9.0.1.src.tar.xz)+[`CLANG`](http://releases.llvm.org/9.0.1/cfe-9.0.1.src.tar.xz) sources; 
-2. build [`LLVM+CLANG`](http://llvm.org/docs/CMake.html):
-   ```shell
-   cd llvm
-   mkdir build dist
-   cd build
-   ```
+2. build [`LLVM+CLANG`](http://releases.llvm.org/9.0.0/docs/CMake.html):
 
-     - **Linux**:
+ **Linux**:
    ```shell
         cmake \
          -DCMAKE_INSTALL_PREFIX=../dist \
@@ -143,8 +142,7 @@ To run it:
          ../llvm
         make -j install
    ```
-
-     - **Windows**:
+ **Windows**:
    ```shell
         cmake \
          -G "Visual Studio 16 2019" \
@@ -156,9 +154,40 @@ To run it:
          -Thost=x64 \
          ../llvm
    ```
-
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.
 
+**LLVM 10.0.0 or newer:**
+
+1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc1.tar.gz) sources;
+2. build [`LLVM project`](http://llvm.org/docs/CMake.html):
+
+ **Linux**:
+   ```shell
+        cmake \
+         -DCMAKE_INSTALL_PREFIX=../dist \
+         -DLLVM_SOURCE_DIR=../llvm-project \
+         -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
+         -DLLVM_ENABLE_PROJECTS="clang" \
+         -DCMAKE_BUILD_TYPE=Release \
+         ../llvm-project/llvm
+        make -j install
+   ```
+ **Windows**:
+   ```shell
+        cmake \
+         -G "Visual Studio 16 2019" \
+         -A x64 \
+         -DCMAKE_INSTALL_PREFIX=../dist \
+         -DLLVM_SOURCE_DIR=../llvm-project \
+         -DLLVM_TARGETS_TO_BUILD="NVPTX" \
+         -DLLVM_ENABLE_PROJECTS="clang" \
+         -DCMAKE_BUILD_TYPE=Release \
+         -Thost=x64 \
+         ../llvm-project/llvm
+   ```
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.
+
+
 
 3. Ensure [`CUDA`](https://developer.nvidia.com/cuda-toolkit-archive) of minimum version 7.0 is installed.
 
@@ -384,7 +413,7 @@ LLVM 7.0.0 - 9.0.1, CUDA 7.5 - 10.1 Update 2, cudnn 7.0.5.15 - 7.6.5.32
 
 Build system requirements for the latest configuration LLVM 9.0.1/CUDA 10.1 Update 2:
 
-Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.3, Visual Studio 2017 (15.5.2) - 2019 (16.4.3).
+Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.3, Visual Studio 2017 (15.5.2) - 2019 (16.4.4).
 
 Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`:
 
@@ -481,4 +510,3 @@ The information contained herein is for informational purposes only, and is subj
 AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
 
 Copyright (c) 2014-2020 Advanced Micro Devices, Inc. All rights reserved.
-

From 7410228025588ed1f069f10a5e8ea22bb75e84c6 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <evgeny.mankov@gmail.com>
Date: Sun, 2 Feb 2020 01:46:58 +0300
Subject: [PATCH 21/23] [HIPIFY][doc] Update README.md

+ Links, versioning, formatting, cleanup
---
 hipify-clang/README.md | 56 ++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 32 deletions(-)

diff --git a/hipify-clang/README.md b/hipify-clang/README.md
index bc29af9772..86a97dbe29 100644
--- a/hipify-clang/README.md
+++ b/hipify-clang/README.md
@@ -28,14 +28,14 @@ After applying all the matchers, the output HIP source is produced.
 **Advantages:**
 
 1. It is a translator; thus, any even very complicated constructs will be parsed successfully, or an error will be reported.
-2. It supports clang options like -I, -D, --cuda-path, etc.
+2. It supports clang options like [`-I`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-i-dir), [`-D`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-d-macro), [`--cuda-path`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-cuda-path), etc.
 3. Seamless support of new CUDA versions as it is clang's responsibility.
 4. Ease in support.
 
 **Disadvantages:**
 
 1. The main advantage is also the main disadvantage: the input CUDA code should be correct; incorrect code wouldn't be translated to HIP.
-2. CUDA should be installed and provided in case of multiple installations by --cuda-path option.
+2. CUDA should be installed and provided in case of multiple installations by `--cuda-path` option.
 3. All the includes and defines should be provided to transform code successfully.
 
 ### <a name="dependencies"></a> hipify-clang: dependencies
@@ -45,7 +45,7 @@ After applying all the matchers, the output HIP source is produced.
 1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).
 
 2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base).
-To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest LLVM release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).
+To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).
 
 | **LLVM release version**                                   | **CUDA latest supported version**                                        | **Windows** | **Linux** |
 |:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:|
@@ -69,11 +69,11 @@ To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-download
 | [**9.0.1**](http://releases.llvm.org/download.html#9.0.1)  | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |
 | [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1) | [10.2](https://developer.nvidia.com/cuda-downloads)             | +           | +         |
 
-`*` Download the patch and unpack it into your LLVM distributive directory; a few header files will be overwritten; rebuilding of LLVM is not needed.
+`*` Download the patch and unpack it into your `LLVM` distributive directory; a few header files will be overwritten; rebuilding of `LLVM` is not needed.
 
-In most cases, you can get a suitable version of LLVM+CLANG with your package manager.
+In most cases, you can get a suitable version of `LLVM+CLANG` with your package manager.
 
-Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
+Failing that or having multiple versions of `LLVM`, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
 [CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.1\dist`
 
 ### <a name="hipify-clang-usage"></a> hipify-clang: usage
@@ -86,10 +86,10 @@ For example:
 ./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
 ```
 
-`hipify-clang` arguments are given first, followed by a separator '--', and then the arguments you'd pass to `clang` if you
+`hipify-clang` arguments are given first, followed by a separator `'--'`, and then the arguments you'd pass to `clang` if you
 were compiling the input file. For example:
 
-```shell
+```bash
 ./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.1 -- -std=c++17
 ```
 
@@ -101,7 +101,7 @@ For a list of `hipify-clang` options, run `hipify-clang --help`.
 
 Assuming this repository is at `./HIP`:
 
-```shell
+```bash
 cd hipify-clang
 mkdir build dist
 cd build
@@ -123,9 +123,9 @@ The binary can then be found at `./dist/bin/hipify-clang`.
 
 ### <a name="testing"></a> hipify-clang: testing
 
-`hipify-clang` has unit tests using LLVM [`lit`](https://llvm.org/docs/CommandGuide/lit.html)/[`FileCheck`](https://llvm.org/docs/CommandGuide/FileCheck.html).
+`hipify-clang` has unit tests using `LLVM` [`lit`](https://llvm.org/docs/CommandGuide/lit.html)/[`FileCheck`](https://llvm.org/docs/CommandGuide/FileCheck.html).
 
-LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive for testing.
+`LLVM+CLANG` should be built from sources, pre-built binaries are not exhaustive for testing.
 
 **LLVM 9.0.1 or older:**
 
@@ -133,7 +133,7 @@ LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive f
 2. build [`LLVM+CLANG`](http://releases.llvm.org/9.0.0/docs/CMake.html):
 
  **Linux**:
-   ```shell
+   ```bash
         cmake \
          -DCMAKE_INSTALL_PREFIX=../dist \
          -DLLVM_SOURCE_DIR=../llvm \
@@ -154,7 +154,7 @@ LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive f
          -Thost=x64 \
          ../llvm
    ```
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.
+Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.
 
 **LLVM 10.0.0 or newer:**
 
@@ -162,7 +162,7 @@ LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive f
 2. build [`LLVM project`](http://llvm.org/docs/CMake.html):
 
  **Linux**:
-   ```shell
+   ```bash
         cmake \
          -DCMAKE_INSTALL_PREFIX=../dist \
          -DLLVM_SOURCE_DIR=../llvm-project \
@@ -185,9 +185,7 @@ LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive f
          -Thost=x64 \
          ../llvm-project/llvm
    ```
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.
-
-
+Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.
 
 3. Ensure [`CUDA`](https://developer.nvidia.com/cuda-toolkit-archive) of minimum version 7.0 is installed.
 
@@ -217,7 +215,7 @@ LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive f
 
 5. Ensure [`python`](https://www.python.org/downloads) of minimum required version 2.7 is installed.
 
-6. Ensure `lit` and `FileCheck` are installed - these are distributed with LLVM.
+6. Ensure `lit` and `FileCheck` are installed - these are distributed with `LLVM`.
 
     * Install `lit` into `python`:
 
@@ -241,23 +239,15 @@ LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive f
 
 7. Set `HIPIFY_CLANG_TESTS` option turned on: `-DHIPIFY_CLANG_TESTS=1`.
 
-8. Run `cmake`:
-     * [***Linux***](#linux)
-     * [***Windows***](#windows)
+8. Build and run tests:
 
-9. Run tests:
-
-     - ***Linux***: `make test-hipify`.
-
-     - ***Windows***: run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`.
-
-### <a name="linux"></a > hipify-clang: Linux
+### <a name="Linux"></a > hipify-clang: Linux
 
 On Linux the following configurations are tested:
 
 Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32
 
-Ubuntu 16-18: LLVM 8.0.0 - 9.0.1, CUDA 8.0 - 10.1 Update 2, cudnn-5.1.10 - cudnn-7.6.5.32
+Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc1, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32
 
 Minimum build system requirements for the above configurations:
 
@@ -265,7 +255,7 @@ Python 2.7, cmake 3.5.1, GNU C/C++ 5.4.0.
 
 Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`:
 
-```shell
+```bash
 cmake
  -DHIPIFY_CLANG_TESTS=1 \
  -DCMAKE_BUILD_TYPE=Release \
@@ -409,9 +399,9 @@ LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18
 
 LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.5.32
 
-LLVM 7.0.0 - 9.0.1, CUDA 7.5 - 10.1 Update 2, cudnn 7.0.5.15 - 7.6.5.32
+LLVM 7.0.0 - 10.0.0-rc1, CUDA 7.5 - 10.2, cudnn 7.0.5.15 - 7.6.5.32
 
-Build system requirements for the latest configuration LLVM 9.0.1/CUDA 10.1 Update 2:
+Build system requirements for the latest stable configuration LLVM 9.0.1/CUDA 10.1 Update 2:
 
 Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.3, Visual Studio 2017 (15.5.2) - 2019 (16.4.4).
 
@@ -448,6 +438,8 @@ cmake
 -- Build files have been written to: f:/HIP/hipify-clang/build
 ```
 
+Run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`.
+
 ## <a name="perl"></a> hipify-perl
 
 `hipify-perl` is autogenerated perl-based script which heavily uses regular expressions.

From 64ed74aaa31bfc37f0528b579ef37c858007aaa1 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Mon, 3 Feb 2020 15:29:35 +0530
Subject: [PATCH 22/23] Update Jenkinsfile

Switch using to newer --gpus all instead of older(deprecated) --runtime nvidia
---
 Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 97811bf5d9..b8bd24cd74 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -394,7 +394,7 @@ cuda_10_x:
     // Block of string constants customizing behavior for cuda
     String nvcc_ver = 'cuda-10.x'
     String from_image = 'ci_test_nodes/cuda-10.x/ubuntu-16.04:latest'
-    String inside_args = '--runtime=nvidia';
+    String inside_args = '--gpus all';
 
     // Checkout source code, dependencies and version files
     String source_hip_rel = checkout_and_version( nvcc_ver )

From 48d1f766d5a0f912c0e96ba34f8e9b28e4e186c5 Mon Sep 17 00:00:00 2001
From: gandryey <56892148+gandryey@users.noreply.github.com>
Date: Mon, 3 Feb 2020 22:23:43 -0500
Subject: [PATCH 23/23] [dtest] RCCL multi device launch test (#1731)

Simple test for hipLaunchCooperativeKernelMultiDevice API.
---
 .../module/hipLaunchCoopMultiKernel.cpp       | 212 ++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp

diff --git a/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp b/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp
new file mode 100644
index 0000000000..ea0f41d631
--- /dev/null
+++ b/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp
@@ -0,0 +1,212 @@
+/*
+Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// Simple test for hipLaunchCooperativeKernelMultiDevice API.
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM all
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <thread>
+#include <chrono>
+#include "hip/hip_cooperative_groups.h"
+#include "test_common.h"
+
+using namespace std::chrono;
+
+const static uint NumOfLoopIterrations = 16 * 1024;
+const static uint BufferSizeInDwords = 28672 * NumOfLoopIterrations;
+const static uint numQueues = 4;
+const static uint numIter = 100;
+constexpr uint NumKernelArgs = 4;
+constexpr uint MaxGPUs = 8;
+
+#include <stdio.h>
+/*
+namespace cg = cooperative_groups;
+using namespace cooperative_groups;
+*/
+
+__global__ void test_gws(uint* buf, uint bufSize, long* tmpBuf, long* result)
+{
+    extern __shared__ long tmp[];
+    uint groups = gridDim.x;
+    uint group_id = blockIdx.x;
+    uint local_id = threadIdx.x;
+    uint chunk = gridDim.x * blockDim.x;
+
+    uint i = group_id * blockDim.x + local_id;
+    long sum = 0;
+    while (i < bufSize) {
+      sum += buf[i];
+      i += chunk;
+    }
+    tmp[local_id] = sum;
+    __syncthreads();
+    i = 0;
+    if (local_id == 0) {
+        sum = 0;
+        while (i < blockDim.x) {
+          sum += tmp[i];
+          i++;
+        }
+        tmpBuf[group_id] = sum;
+    }
+
+    // wait
+    cooperative_groups::this_grid().sync();
+
+    if (((blockIdx.x * blockDim.x) + threadIdx.x) == 0) {
+        for (uint i = 1; i < groups; ++i) {
+          sum += tmpBuf[i];
+       }
+       //*result = sum;
+       result[1 + cooperative_groups::this_multi_grid().grid_rank()] = sum;
+    }
+    cooperative_groups::this_multi_grid().sync();
+    if (cooperative_groups::this_multi_grid().grid_rank() == 0) {
+      sum = 0;
+      for (uint i = 1; i <= cooperative_groups::this_multi_grid().num_grids(); ++i) {
+        sum += result[i];
+      }
+      *result = sum;
+    }
+}
+
+int main() {
+  float *A, *B;
+  uint* dA[MaxGPUs];
+  long* dB[MaxGPUs];
+  long* dC;
+  hipModule_t Module;
+  hipStream_t stream[MaxGPUs];
+
+  uint32_t* init = new uint32_t[BufferSizeInDwords];
+  for (uint32_t i = 0; i < BufferSizeInDwords; ++i) {
+    init[i] = i;
+  }
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  size_t copySizeInDwords = BufferSizeInDwords / nGpu;
+  hipDeviceProp_t deviceProp[MaxGPUs];
+
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    // Calculate the device occupancy to know how many blocks can be run concurrently
+    hipGetDeviceProperties(&deviceProp[i], 0);
+    if (!deviceProp[i].cooperativeMultiDeviceLaunch) {
+      printf("Device doesn't support cooperative launch!");
+      passed();
+      return 0;
+    }
+    size_t SIZE = copySizeInDwords * sizeof(uint);
+
+    HIPCHECK(hipMalloc((void**)&dA[i], SIZE));
+    if (i == 0) {
+      HIPCHECK(hipHostMalloc((void**)&dC, (nGpu + 1) * sizeof(long), hipHostMallocCoherent));
+    }
+    HIPCHECK(hipMemcpy(dA[i], &init[i * copySizeInDwords] , SIZE, hipMemcpyHostToDevice));
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  dim3 dimBlock;
+  dim3 dimGrid;
+  dimGrid.x = 1;
+  dimGrid.y = 1;
+  dimGrid.z = 1;
+  dimBlock.x = 64;
+  dimBlock.y = 1;
+  dimBlock.z = 1;
+
+  int numBlocks = 0;
+  uint workgroups[3] = {64, 128, 256};
+
+  hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
+
+  system_clock::time_point start = system_clock::now();
+
+  for (uint set = 0; set < 3; ++set) {
+    void* args[MaxGPUs * NumKernelArgs];
+    std::cout << "---------- Test#" << set << "---------------\n";
+    for (int i = 0; i < nGpu; i++) {
+      HIPCHECK(hipSetDevice(i));
+      dimBlock.x = workgroups[set];
+      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
+      (hipFunction_t)test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));
+      
+      std::cout << "GPU(" << i << ") Block size: " << dimBlock.x << " Num blocks per CU: " << numBlocks << "\n";
+
+      dimGrid.x = deviceProp[i].multiProcessorCount * std::min(numBlocks, 32);
+      HIPCHECK(hipMalloc((void**)&dB[i], dimGrid.x * sizeof(long)));
+
+      args[i * NumKernelArgs]     = (void*)&dA[i];
+      args[i * NumKernelArgs + 1] = (void*)&copySizeInDwords;
+      args[i * NumKernelArgs + 2] = (void*)&dB[i];
+      args[i * NumKernelArgs + 3] = (void*)&dC;
+
+      launchParamsList[i].func = reinterpret_cast<void*>(test_gws);
+      launchParamsList[i].gridDim = dimGrid;
+      launchParamsList[i].blockDim = dimBlock;
+      launchParamsList[i].sharedMem = dimBlock.x * sizeof(long);
+      launchParamsList[i].stream = stream[i];
+      launchParamsList[i].args = &args[i * NumKernelArgs];
+    }
+ 
+    hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0);
+
+    HIPCHECK(hipMemcpy(init, dC, sizeof(long), hipMemcpyDeviceToHost));
+
+    if (*dC != (((long)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) / 2)) {
+      std::cout << "Data validation failed for grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n";
+      std::cout << "Test failed! \n";
+    }
+    for (int i = 0; i < nGpu; i++) {
+      hipFree(dB[i]);
+    }
+  }
+  system_clock::time_point end = system_clock::now();
+
+  delete [] launchParamsList;
+
+  std::chrono::duration<double> elapsed_seconds = end - start;
+
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ 
+  std::cout << "finished computation at " << std::ctime(&end_time) <<
+    "elapsed time: " << elapsed_seconds.count() << "s\n";
+
+  hipSetDevice(0);
+  hipFree(dC);
+  for (int i = 0; i < nGpu; i++) {
+    hipFree(dA[i]);
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+  delete [] init;
+  passed();
+  return 0;
+}