From 2cacda91bb2d1a6c868d57d24364106c2642143e Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Wed, 1 Nov 2017 22:33:13 +0000
Subject: [PATCH] Correctly deal with functions from shared objects, wherein
 the program visible VA == so_base_va + st_value(function_symbol). Remove
 quaint usage of pfe for hipMemset (which is actually fill_n).

---
 src/hip_memory.cpp    | 133 +++++++++++++++++++-----------------------
 src/program_state.cpp | 108 ++++++++++++++++++++++++----------
 2 files changed, 136 insertions(+), 105 deletions(-)
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index 96fc25c27d..32e0016178 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -1153,42 +1153,56 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
     return ihipLogStatus(e);
 }
 
-// TODO - make member function of stream?
+namespace
+{
+    template<
+        uint32_t block_dim,
+        typename RandomAccessIterator,
+        typename N,
+        typename T>
+    __global__
+    void hip_fill_n(RandomAccessIterator f, N n, T value)
+    {
+        const uint32_t grid_dim = hipGridDim_x;
+
+        size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x;
+        while (idx < n) {
+            new (&f[idx]) T{value};
+            idx += grid_dim;
+        }
+    }
+
+    template<
+        typename T,
+        typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+    inline
+    const T& clamp_integer(const T& x, const T& lower, const T& upper)
+    {
+        assert(!(upper < lower));
+
+        return std::min(upper, std::max(x, lower));
+    }
+}
+
 template <typename T>
 void
 ihipMemsetKernel(hipStream_t stream,
-    LockedAccessor_StreamCrit_t &crit,
-    T * ptr, T val, size_t sizeBytes,
-    hc::completion_future *cf)
+    T * ptr, T val, size_t sizeBytes)
 {
-    int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
-    const int threads_per_wg = 256;
+    static constexpr uint32_t block_dim = 256;
 
-    int threads = wg * threads_per_wg;
-    if (threads > sizeBytes) {
-        threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
-    }
-
-
-    hc::extent<1> ext(threads);
-    auto ext_tile = ext.tile(threads_per_wg);
-
-    *cf =
-    hc::parallel_for_each(
-            crit->_av,
-            ext_tile,
-            [=] (hc::tiled_index<1> idx)
-            __attribute__((hc))
-    {
-        int offset = amp_get_global_id(0);
-        // TODO-HCC - change to hc_get_local_size()
-        int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
-
-        for (int i=offset; i<sizeBytes; i+=stride) {
-            ptr[i] = val;
-        }
-    });
+    const uint32_t grid_dim = clamp_integer<size_t>(
+        sizeBytes / block_dim, 1, UINT32_MAX);
 
+    hipLaunchKernelGGL(
+        hip_fill_n<block_dim>,
+        dim3(grid_dim),
+        dim3{block_dim},
+        0u,
+        stream,
+        ptr,
+        sizeBytes,
+        std::move(val));
 }
 
 
@@ -1202,17 +1216,12 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
     stream =  ihipSyncAndResolveStream(stream);
 
     if (stream) {
-        auto crit = stream->lockopen_preKernelCommand();
-
-
-        hc::completion_future cf ;
-
         if ((sizeBytes & 0x3) == 0) {
             // use a faster dword-per-workitem copy:
             try {
                 value = value & 0xff;
                 uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
-                ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
+                ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
             }
             catch (std::exception &ex) {
                 e = hipErrorInvalidValue;
@@ -1220,19 +1229,16 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
         } else {
             // use a slow byte-per-workitem copy:
             try {
-                ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
+                ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
             }
             catch (std::exception &ex) {
                 e = hipErrorInvalidValue;
             }
         }
 
-        stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av);
-
-
         if (HIP_API_BLOCKING) {
             tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str());
-            cf.wait();
+            stream->locked_wait();
         }
     } else {
         e = hipErrorInvalidValue;
@@ -1253,16 +1259,12 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
     stream =  ihipSyncAndResolveStream(stream);
 
     if (stream) {
-        auto crit = stream->lockopen_preKernelCommand();
-
-        hc::completion_future cf ;
-
         if ((sizeBytes & 0x3) == 0) {
             // use a faster dword-per-workitem copy:
             try {
                 value = value & 0xff;
                 uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
-                ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
+                ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
             }
             catch (std::exception &ex) {
                 e = hipErrorInvalidValue;
@@ -1270,21 +1272,18 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
         } else {
             // use a slow byte-per-workitem copy:
             try {
-                ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
+                ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
             }
             catch (std::exception &ex) {
                 e = hipErrorInvalidValue;
             }
         }
         // TODO - is hipMemset supposed to be async?
-        cf.wait();
-
-        stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
-
+        stream->locked_wait();
 
         if (HIP_LAUNCH_BLOCKING) {
             tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
-            cf.wait();
+            stream->locked_wait();
             tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
         }
     } else {
@@ -1305,17 +1304,13 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
     stream =  ihipSyncAndResolveStream(stream);
 
     if (stream) {
-        auto crit = stream->lockopen_preKernelCommand();
-
-        hc::completion_future cf ;
-
         size_t sizeBytes = pitch * height;
         if ((sizeBytes & 0x3) == 0) {
             // use a faster dword-per-workitem copy:
             try {
                 value = value & 0xff;
                 uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
-                ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
+                ihipMemsetKernel<uint32_t> (stream,  static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
             }
             catch (std::exception &ex) {
                 e = hipErrorInvalidValue;
@@ -1323,20 +1318,18 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
         } else {
             // use a slow byte-per-workitem copy:
             try {
-                ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
+                ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
             }
             catch (std::exception &ex) {
                 e = hipErrorInvalidValue;
             }
         }
         // TODO - is hipMemset supposed to be async?
-        cf.wait();
-
-        stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
+        stream->locked_wait();
 
         if (HIP_LAUNCH_BLOCKING) {
             tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
-            cf.wait();
+            stream->locked_wait();
             tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
         }
     } else {
@@ -1357,36 +1350,30 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char  value, size_t sizeByte
     stream =  ihipSyncAndResolveStream(stream);
 
     if (stream) {
-        auto crit = stream->lockopen_preKernelCommand();
-
-        hc::completion_future cf ;
-
         if ((sizeBytes & 0x3) == 0) {
             // use a faster dword-per-workitem copy:
             try {
                 uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
-                ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
+                ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
             }
             catch (std::exception &ex) {
+                std::cout << ex.what() << std::endl;
                 e = hipErrorInvalidValue;
             }
         } else {
             // use a slow byte-per-workitem copy:
             try {
-                ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
+                ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
             }
             catch (std::exception &ex) {
                 e = hipErrorInvalidValue;
             }
         }
-        cf.wait();
-
-        stream->lockclose_postKernelCommand("hipMemsetD8", &crit->_av);
-
+        stream->locked_wait();
 
         if (HIP_LAUNCH_BLOCKING) {
             tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
-            cf.wait();
+            stream->locked_wait();
             tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
         }
     } else {
diff --git a/src/program_state.cpp b/src/program_state.cpp
index be871a6e84..a4f7fdbdbe 100644
--- a/src/program_state.cpp
+++ b/src/program_state.cpp
@@ -195,9 +195,9 @@ namespace
             static vector<vector<uint8_t>> blobs{
                 code_object_blob_for_process()};
 
-            dl_iterate_phdr([](dl_phdr_info* i, std::size_t, void*) {
+            dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
                 elfio tmp;
-                if (tmp.load(i->dlpi_name)) {
+                if (tmp.load(info->dlpi_name)) {
                     const auto it = find_section_if(tmp, [](const section* x) {
                         return x->get_name() == ".kernel";
                     });
@@ -269,6 +269,61 @@ namespace
         return r;
     }
 
+    vector<pair<uintptr_t, string>> function_names_for(
+        const elfio& reader, section* symtab)
+    {
+        vector<pair<uintptr_t, string>> r;
+        symbol_section_accessor symbols{reader, symtab};
+
+        auto foo = reader.get_entry();
+
+        for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
+            // TODO: this is boyscout code, caching the temporaries
+            //       may be of worth.
+            string name;
+            Elf64_Addr value = 0;
+            Elf_Xword size = 0;
+            Elf_Half sect_idx = 0;
+            uint8_t bind = 0;
+            uint8_t type = 0;
+            uint8_t other = 0;
+
+            symbols.get_symbol(
+                i, name, value, size, bind, type, sect_idx, other);
+
+            if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) {
+                r.emplace_back(value, name);
+            }
+        }
+
+        return r;
+    }
+
+    const vector<pair<uintptr_t, string>>& function_names_for_process()
+    {
+        static constexpr const char self[] = "/proc/self/exe";
+
+        static vector<pair<uintptr_t, string>> r;
+        static once_flag f;
+
+        call_once(f, []() {
+            elfio reader;
+
+            if (!reader.load(self)) {
+                throw runtime_error{
+                    "Failed to load the ELF file for the current process."};
+            }
+
+            auto symtab = find_section_if(reader, [](const section* x) {
+                return x->get_type() == SHT_SYMTAB;
+            });
+
+            r = function_names_for(reader, symtab);
+        });
+
+        return r;
+    }
+
     inline
     hsa_agent_t agent(hsa_executable_symbol_t x)
     {
@@ -395,43 +450,32 @@ namespace hip_impl
 {
     const unordered_map<uintptr_t, string>& function_names()
     {
-        static constexpr const char self[] = "/proc/self/exe";
-
-        static unordered_map<uintptr_t, string> r;
+        static unordered_map<uintptr_t, string> r{
+            function_names_for_process().cbegin(),
+            function_names_for_process().cend()};
         static once_flag f;
 
         call_once(f, []() {
-            elfio reader;
+            dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
+                elfio tmp;
+                if (tmp.load(info->dlpi_name)) {
+                    const auto it = find_section_if(tmp, [](const section* x) {
+                        return x->get_type() == SHT_SYMTAB;
+                    });
 
-            if (!reader.load(self)) {
-                throw runtime_error{
-                    "Failed to load the ELF file for the current process."};
-            }
+                    if (it) {
+                        auto n = function_names_for(tmp, it);
 
-            auto symtab = find_section_if(reader, [](const section* x) {
-                return x->get_type() == SHT_SYMTAB;
-            });
+                        for (auto&& f : n) f.first += info->dlpi_addr;
 
-            symbol_section_accessor symbols{reader, symtab};
-
-            for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
-                // TODO: this is boyscout code, caching the temporaries
-                //       may be of worth.
-                string name;
-                Elf64_Addr value = 0;
-                Elf_Xword size = 0;
-                Elf_Half sect_idx = 0;
-                uint8_t bind = 0;
-                uint8_t type = 0;
-                uint8_t other = 0;
-
-                symbols.get_symbol(
-                    i, name, value, size, bind, type, sect_idx, other);
-
-                if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) {
-                    r.emplace(value, name);
+                        r.insert(
+                            make_move_iterator(n.begin()),
+                            make_move_iterator(n.end()));
+                    }
                 }
-            }
+
+                return 0;
+            }, nullptr);
         });
 
         return r;