diff --git a/projects/rocshmem/internal/clients/shmem_rccl/CMakeLists.txt b/projects/rocshmem/internal/clients/shmem_rccl/CMakeLists.txt
deleted file mode 100644
index fa07c7435f..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/CMakeLists.txt
+++ /dev/null
@@ -1,95 +0,0 @@
-###############################################################################
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to
-# deal in the Software without restriction, including without limitation the
-# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-# sell copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-###############################################################################
-
-cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
-
-###############################################################################
-# GLOBAL COMPILE FLAGS
-###############################################################################
-set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_COMPILER /opt/rocm/bin/hipcc )
-set(CMAKE_CXX_FLAGS_DEBUG "-O0 -ggdb")
-
-###############################################################################
-# DEFAULT BUILD TYPE
-###############################################################################
-if(NOT CMAKE_BUILD_TYPE)
-  message(STATUS "CMAKE_BUILD_TYPE unspecified: generating Release build")
-
-  set(
-    CMAKE_BUILD_TYPE
-    "Release"
-    CACHE
-      STRING
-        "build type: Release, Debug, RelWithDebInfo, MinSizeRel"
-    FORCE
-  )
-endif()
-
-###############################################################################
-# PROJECT
-###############################################################################
-project(rocshmem_example_driver VERSION 1.1.0 LANGUAGES CXX)
-
-###############################################################################
-# SOURCES
-###############################################################################
-add_executable(${PROJECT_NAME} "")
-
-target_include_directories(
-  ${PROJECT_NAME}
-  PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}
-)
-
-target_sources(
-  ${PROJECT_NAME}
-  PRIVATE
-    test_driver.cpp
-    tester.cpp
-    tester_arguments.cpp
-    primitive_tester.cpp
-)
-
-###############################################################################
-# ROCSHMEM
-###############################################################################
-find_package(hip REQUIRED)
-find_package(rocshmem CONFIG REQUIRED)
-
-target_include_directories(
-  ${PROJECT_NAME}
-  PRIVATE
-    rocshmem::rocshmem
-)
-
-target_link_libraries(
-  ${PROJECT_NAME}
-  PRIVATE
-    rocshmem::rocshmem
-    hip::host
-    -fgpu-rdc
-#   xnack allows address translation fault recovery
-#   required option for managed heap configs
-#    -mxnack
-)
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/build_configs/debug b/projects/rocshmem/internal/clients/shmem_rccl/build_configs/debug
deleted file mode 100755
index f7cbb1967e..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/build_configs/debug
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/..
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Debug \
-    -DCMAKE_VERBOSE_MAKEFILE=ON \
-    -Drocshmem_DIR=$install_path/share/cmake/rocshmem \
-    $src_path
-cmake --build . --parallel 8
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/build_configs/release b/projects/rocshmem/internal/clients/shmem_rccl/build_configs/release
deleted file mode 100755
index baa8b4277a..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/build_configs/release
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/..
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -Drocshmem_DIR=$install_path/share/cmake/rocshmem \
-    $src_path
-cmake --build . --parallel 8
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.cpp b/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.cpp
deleted file mode 100644
index fc8dd91f31..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "primitive_tester.hpp"
-
-#include <rocshmem/rocshmem.hpp>
-#include <rocshmem/rocshmem_debug.hpp>
-
-#include <unistd.h>
-
-using namespace rocshmem;
-
-/******************************************************************************
- * DEVICE TEST KERNEL
- *****************************************************************************/
-__global__ void
-PrimitiveTest(int loop,
-              int *flag,
-              char *s_buf,
-              char *r_buf,
-              int size,
-              int my_pe,
-              ShmemContextType ctx_type)
-{
-    __shared__ rocshmem_ctx_t ctx;
-    rocshmem_wg_init();
-    rocshmem_wg_ctx_create(ctx_type, &ctx);
-
-    int block_id = hipBlockIdx_x;
-    for(int i =0; i< loop; i++){
-        rocshmem_ctx_putmem_nbi_wg(ctx, &r_buf[my_pe*size], &s_buf[block_id * size], size, block_id);
-        if(hipThreadIdx_x==0){
-            //rocshmem_ctx_quiet(ctx);
-            //rocshmem_ctx_threadfence_system(ctx);
-            rocshmem_ctx_int_p(ctx, &flag[my_pe], i+1, block_id);
-            //rocshmem_ctx_quiet(ctx);
-            rocshmem_int_wait_until(&flag[block_id], ROCSHMEM_CMP_EQ, i+1);
-
-        }
-        __syncthreads();
-    }
-
-    rocshmem_wg_ctx_destroy(ctx);
-    rocshmem_wg_finalize();
-}
-
-/******************************************************************************
- * HOST TESTER CLASS METHODS
- *****************************************************************************/
-PrimitiveTester::PrimitiveTester(TesterArguments args)
-    : Tester(args)
-{
-    flag = (int*) rocshmem_malloc(args.numprocs);
-    memset(flag, 0, args.numprocs*sizeof(int));
-   // s_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size);
-   // r_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size);
-}
-
-PrimitiveTester::~PrimitiveTester()
-{
-    rocshmem_free(s_buf);
-    rocshmem_free(r_buf);
-}
-
-void
-PrimitiveTester::resetBuffers(uint64_t size)
-{
-    memset(s_buf, '0', size * args.numprocs);
-    memset(r_buf, '1', size * args.numprocs);
-}
-
-void
-PrimitiveTester::launchKernel(dim3 gridSize,
-                              dim3 blockSize,
-                              int loop,
-                              uint64_t size,
-                              int nproc, int my_pe)
-{
-
-    void* sendBuf = malloc(64);
-    void* recvBuf = malloc(64 * nproc);
-
-    s_buf = (char *)rocshmem_malloc(size * nproc);
-    r_buf = (char *)rocshmem_malloc(size * nproc);
-    resetBuffers(size);
-
-    MPI_Allgather(sendBuf, 64, MPI_CHAR,
-                  recvBuf, 64, MPI_CHAR,
-                  MPI_COMM_WORLD);
-
-    size_t shared_bytes;
-    rocshmem_dynamic_shared(&shared_bytes);
-
-    hipLaunchKernelGGL(PrimitiveTest,
-                       gridSize,
-                       blockSize,
-                       shared_bytes,
-                       stream,
-                       loop,
-                       flag,
-                       s_buf,
-                       r_buf,
-                       size,
-                       my_pe,
-                       _shmem_context);
-
-    //num_msgs = (loop + args.skip) * gridSize.x;
-    num_timed_msgs = loop ;
-}
-
-void
-PrimitiveTester::verifyResults(uint64_t size)
-{
-    int check_id =0;
-    if (args.myid == check_id) {
-        for (int i = 0; i < size*args.numprocs; i++) {
-            if (r_buf[i] != '0') {
-                fprintf(stderr, "Data validation error at idx %d\n", i);
-                fprintf(stderr, "Got %c, Expected %c\n", r_buf[i], '0');
-                exit(-1);
-            }
-        }
-    }
-}
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.hpp b/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.hpp
deleted file mode 100644
index 1c5009c190..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef _PRIMITIVE_TESTER_HPP_
-#define _PRIMITIVE_TESTER_HPP_
-
-#include "tester.hpp"
-#include <mpi.h>
-
-/******************************************************************************
- * HOST TESTER CLASS
- *****************************************************************************/
-class PrimitiveTester : public Tester
-{
-  public:
-    explicit PrimitiveTester(TesterArguments args);
-    virtual ~PrimitiveTester();
-
-  protected:
-    virtual void
-    resetBuffers(uint64_t size) override;
-
-    virtual void
-    launchKernel(dim3 gridSize,
-                 dim3 blockSize,
-                 int loop,
-                 uint64_t size,
-                 int nproc, int my_pe) override;
-
-    virtual void
-    verifyResults(uint64_t size) override;
-
-    char *s_buf = nullptr;
-    char *r_buf = nullptr;
-    int *flag = nullptr;
-};
-
-#endif
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/test_driver.cpp b/projects/rocshmem/internal/clients/shmem_rccl/test_driver.cpp
deleted file mode 100644
index 511b42ac9d..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/test_driver.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include <vector>
-
-#include <rocshmem/rocshmem.hpp>
-
-#include "tester.hpp"
-#include "tester_arguments.hpp"
-
-using namespace rocshmem;
-
-int main(int argc, char * argv[])
-{
-    /**
-     * Setup the tester arguments.
-     */
-    TesterArguments args(argc, argv);
-
-    /***
-     * Select a GPU
-     */
-    int rank = rocshmem_my_pe();
-    int ndevices, my_device=0;
-    hipGetDeviceCount (&ndevices);
-    my_device = rank % ndevices;
-    hipSetDevice(my_device);
-
-    /**
-     * Must initialize rocshmem to access arguments needed by the tester.
-     */
-    rocshmem_init(args.num_wgs);
-
-    /**
-     * Now grab the arguments from rocshmem.
-     */
-    args.get_rocshmem_arguments();
-
-    /**
-     * Using the arguments we just constructed, call the tester factory
-     * method to get the tester (specified by the arguments).
-     */
-    std::vector<Tester *> tests = Tester::create(args);
-
-    /**
-     * Run the tests
-     */
-    for (auto test : tests) {
-       test->execute();
-
-    /**
-     * The tester factory method news the tester to create it so we clean
-     * up the memory here.
-     */
-       delete test;
-    }
-
-    /**
-     * The rocshmem library needs to be cleaned up with this call. It pairs
-     * with the init function above.
-     */
-    rocshmem_finalize();
-
-    return 0;
-}
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/tester.cpp b/projects/rocshmem/internal/clients/shmem_rccl/tester.cpp
deleted file mode 100644
index d43ee09846..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/tester.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "tester.hpp"
-
-#include <functional>
-#include <vector>
-#include <iostream>
-#include <hip/hip_runtime.h>
-#include <mpi.h>
-#include <rocshmem/rocshmem.hpp>
-
-//#include "broadcast_tester.hpp"
-#include "primitive_tester.hpp"
-
-Tester::Tester(TesterArguments args)
-    : args(args)
-{
-    _type = (TestType) args.algorithm;
-    _shmem_context = args.shmem_context;
-    hipStreamCreate(&stream);
-    hipEventCreate(&start_event);
-    hipEventCreate(&stop_event);
-    hipMalloc((void**)&timer, sizeof(uint64_t) * args.num_wgs);
-}
-
-Tester::~Tester()
-{
-    hipFree(timer);
-    hipEventDestroy(stop_event);
-    hipEventDestroy(start_event);
-    hipStreamDestroy(stream);
-}
-
-std::vector<Tester*>
-Tester::create(TesterArguments args)
-{
-    int rank = args.myid;
-    std::vector<Tester*> testers;
-
-    if (rank == 0)
-        std::cout << "*** Creating Test: ";
-
-    TestType type = (TestType) args.algorithm;
-
-    switch (type) {
-        case AlltoAll_Put:
-            if (rank == 0)
-                std::cout << "AlltoAll Puts***" << std::endl;
-            testers.push_back(new PrimitiveTester(args));
-            return testers;
-        case AlltoAll_Get:
-            if (rank == 0)
-                std::cout << "AlltoAll Gets***" << std::endl;
-            testers.push_back(new PrimitiveTester(args));
-            return testers;
-        default:
-            if (rank == 0)
-                std::cout << "Unknown***" << std::endl;
-            testers.push_back(new PrimitiveTester(args));
-            return testers;
-    }
-    return testers;
-}
-
-void
-Tester::execute()
-{
-
-    int num_loops = args.loop;
-
-    /**
-     * Some tests loop through data sizes in powers of 2 and report the
-     * results for those ranges.
-     */
-    for (uint64_t size = args.min_msg_size;
-         size <= args.max_msg_size;
-         size <<= 1) {
-
-
-        /**
-         * Restricts the number of iterations of really large messages.
-         */
-        if (size > args.large_message_size)
-            num_loops = args.loop_large;
-
-
-
-            /**
-             * TODO:
-             * Verify that this timer type is actually uint64_t on the
-             * device side.
-             */
-            memset(timer, 0, sizeof(uint64_t) * args.num_wgs);
-
-            const dim3 blockSize(args.wg_size, 1, 1);
-            const dim3 gridSize(args.num_wgs, 1, 1);
-
-            hipEventRecord(start_event, stream);
-
-            launchKernel(gridSize, blockSize, num_loops, size, args.numprocs, args.myid);
-
-            hipEventRecord(stop_event, stream);
-            hipError_t err = hipStreamSynchronize(stream);
-            if (err != hipSuccess) {
-                printf("error = %d \n", err);
-            }
-
-//            rocshmem_dump_stats();
-      //      rocshmem_reset_stats();
-
-
-
-        // data validation
-        verifyResults(size);
-
-        barrier();
-        resetBuffers(size);
-
-        print(size);
-    }
-}
-
-
-void
-Tester::print(uint64_t size)
-{
-    if (args.myid != 0) {
-        return;
-    }
-
- //   uint64_t timer_avg = timerAvgInMicroseconds();
- //   double latency_avg = static_cast<double>(timer_avg) / num_timed_msgs;
- //   double avg_msg_rate = num_timed_msgs / (timer_avg / 1e6);
-
-    float total_kern_time_ms;
-    hipEventElapsedTime(&total_kern_time_ms, start_event, stop_event);
-    float total_kern_time_s = total_kern_time_ms / 1000;
-    double bandwidth_avg_gbs = num_timed_msgs * size * bw_factor / total_kern_time_s / pow(2, 30);
-
-    float latency_us = (total_kern_time_ms *1000) /num_timed_msgs;
-
-    int field_width = 20;
-    int float_precision = 2;
-
-    printf("\n##### Message Size %lu #####\n", size);
-
-    printf("%*s%*s\n",
-           field_width + 1, "Latency AVG (us)",
-           field_width + 1, "Bandwidth (GB/s)");
-
-    printf("%*.*f %*.*f \n",
-           field_width, float_precision, latency_us,
-           field_width, float_precision, bandwidth_avg_gbs);
-
-    fflush(stdout);
-}
-
-void
-Tester::barrier()
-{
-    MPI_Barrier(MPI_COMM_WORLD);
-}
-
-uint64_t
-Tester::gpuCyclesToMicroseconds(uint64_t cycles)
-{
-    /**
-     * The dGPU asm core timer runs at 27MHz. This is different from the
-     * core clock returned by HIP. For an APU, this is different and might
-     * need adjusting.
-     */
-    uint64_t gpu_frequency_MHz = 27;
-
-    /**
-     * hipDeviceGetAttribute(&gpu_frequency_khz,
-     *                       hipDeviceAttributeClockRate,
-     *                       0);
-     */
-
-    return cycles / gpu_frequency_MHz;
-}
-
-uint64_t
-Tester::timerAvgInMicroseconds()
-{
-    uint64_t sum = 0;
-
-    for (int i = 0; i < args.num_wgs; i++) {
-       sum += gpuCyclesToMicroseconds(timer[i]);
-    }
-
-    return sum / args.num_wgs;
-}
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/tester.hpp b/projects/rocshmem/internal/clients/shmem_rccl/tester.hpp
deleted file mode 100644
index 831cc10064..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/tester.hpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef _TESTER_HPP_
-#define _TESTER_HPP_
-
-#include <vector>
-
-#include <rocshmem/rocshmem.hpp>
-
-#include "tester_arguments.hpp"
-
-/******************************************************************************
- * TESTER CLASS TYPES
- *****************************************************************************/
-enum TestType
-{
-    AlltoAll_Put             = 0,
-    AlltoAll_Get             = 1
-};
-
-typedef int ShmemContextType;
-
-/******************************************************************************
- * TESTER INTERFACE
- *****************************************************************************/
-class Tester
-{
-  public:
-    explicit Tester(TesterArguments args);
-    virtual ~Tester();
-
-    void
-    execute();
-
-    static std::vector<Tester*>
-    create(TesterArguments args);
-
-  protected:
-    virtual void
-    resetBuffers(uint64_t size) = 0;
-
-    virtual void
-    preLaunchKernel() {}
-
-    virtual void
-    launchKernel(dim3 gridSize,
-                 dim3 blockSize,
-                 int loop,
-                 uint64_t size,
-                 int nproc, int my_pe) = 0;
-
-    virtual void
-    postLaunchKernel() {}
-
-    virtual void
-    verifyResults(uint64_t size) = 0;
-
-    int num_msgs = 0;
-    int num_timed_msgs = 0;
-    int bw_factor = 1;
-
-    TesterArguments args;
-
-    TestType _type;
-    ShmemContextType _shmem_context = 8; //SHMEM_CTX_WP_PRIVATE
-
-    hipStream_t stream;
-
-    uint64_t *timer = nullptr;
-
-  private:
-    void
-    print(uint64_t size);
-
-    void
-    barrier();
-
-    uint64_t
-    gpuCyclesToMicroseconds(uint64_t cycles);
-
-    uint64_t
-    timerAvgInMicroseconds();
-
-    bool
-    peLaunchesKernel();
-
-    hipEvent_t start_event;
-    hipEvent_t stop_event;
-};
-
-#endif /* _TESTER_HPP */
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.cpp b/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.cpp
deleted file mode 100644
index 6c835169bb..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "tester.hpp"
-#include "tester_arguments.hpp"
-
-#include <cstdlib>
-#include <iostream>
-
-#include <rocshmem/rocshmem.hpp>
-
-using namespace rocshmem;
-
-TesterArguments::TesterArguments(int argc, char *argv[])
-{
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-        if (arg == "-w") {
-            i++;
-            num_wgs = atoi(argv[i]);
-        } else if (arg == "-S") {
-            i++;
-            max_msg_size = atoll(argv[i]);
-        } else if (arg == "-s") {
-            i++;
-            min_msg_size = atoll(argv[i]);
-        } else if (arg == "-a") {
-            i++;
-            algorithm = atoi(argv[i]);
-        } else if (arg == "-z") {
-            i++;
-            wg_size = atoi(argv[i]);
-        } else if (arg == "-x") {
-            i++;
-            shmem_context = atoi(argv[i]);
-        } else {
-            show_usage(argv[0]);
-            exit(-1);
-        }
-    }
-
-}
-
-void
-TesterArguments::show_usage(std::string executable_name)
-{
-    std::cout << "Usage: " << executable_name << std::endl;
-    std::cout << "\t-t <number of rocshmem service threads>\n";
-    std::cout << "\t-w <number of workgroups>\n";
-    std::cout << "\t-s <maximum message size (in bytes)>\n";
-    std::cout << "\t-a <algorithm number to test>\n";
-    std::cout << "\t-z <WorkGroup Size>\n";
-    std::cout << "\t-c <Coalescing Coefficient>\n";
-    std::cout << "\t-o <Operation type for the random_access test>\n";
-    std::cout << "\t-ta <Number of Thread Accessing the communication>\n";
-    std::cout << "\t-x <shmem context>\n";
-}
-
-void
-TesterArguments::get_rocshmem_arguments()
-{
-    numprocs = rocshmem_n_pes();
-    myid = rocshmem_my_pe();
-
-}
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.hpp b/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.hpp
deleted file mode 100644
index 175470df5b..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef _TESTER_ARGUMENTS_HPP_
-#define _TESTER_ARGUMENTS_HPP_
-
-#include <string>
-
-#include <climits>
-#include <cstdint>
-
-class TesterArguments
-{
-  public:
-    TesterArguments(int argc, char *argv[]);
-
-    /**
-     * Initialize rocshmem members
-     * Valid after rocshmem_init function called.
-     */
-    void get_rocshmem_arguments();
-
-  private:
-    /**
-     * Output method which displays available command line options
-     */
-    static void show_usage(std::string executable_name);
-
-  public:
-    /**
-     * Arguments obtained from command line
-     */
-    unsigned num_wgs = 1;
-    unsigned algorithm = 0;
-    uint64_t min_msg_size = 1;
-    uint64_t max_msg_size = 1 << 20;
-    unsigned wg_size = 64;
-    unsigned shmem_context = 8; // ROCSHMEM_CTX_WG_PRIVATE
-
-    /**
-     * Arguments obtained from rocshmem
-     */
-    unsigned numprocs = UINT_MAX;
-    unsigned myid = UINT_MAX;
-
-    /**
-     * Defaults tester values
-     */
-    int loop = 100;
-    int skip = 10;
-    int loop_large = 25;
-    int large_message_size = 32768;
-};
-
-#endif
diff --git a/projects/rocshmem/internal/clients/spts/CMakeLists.txt b/projects/rocshmem/internal/clients/spts/CMakeLists.txt
deleted file mode 100644
index 172c667776..0000000000
--- a/projects/rocshmem/internal/clients/spts/CMakeLists.txt
+++ /dev/null
@@ -1,144 +0,0 @@
-###############################################################################
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to
-# deal in the Software without restriction, including without limitation the
-# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-# sell copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-###############################################################################
-
-cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
-
-###############################################################################
-# GLOBAL COMPILE FLAGS
-###############################################################################
-set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-# Default to the ROCm hipcc driver, but only when no compiler has been chosen
-# already (for example with -DCMAKE_CXX_COMPILER=...). This must run before
-# project(), which is where the compiler is probed.
-if(NOT DEFINED CMAKE_CXX_COMPILER)
-  set(CMAKE_CXX_COMPILER /opt/rocm/bin/hipcc)
-endif()
-
-###############################################################################
-# DEFAULT BUILD TYPE
-###############################################################################
-# FORCE is acceptable here: the branch is only entered when the cached value
-# is empty, so no user-provided build type is ever overwritten.
-if(NOT CMAKE_BUILD_TYPE)
-  message(STATUS "CMAKE_BUILD_TYPE unspecified: generating Release build")
-
-  set(
-    CMAKE_BUILD_TYPE
-    "Release"
-    CACHE
-      STRING
-        "build type: Release, Debug, RelWithDebInfo, MinSizeRel"
-    FORCE
-  )
-endif()
-
-###############################################################################
-# PROJECT
-###############################################################################
-project(spts VERSION 1.1.0 LANGUAGES CXX)
-
-###############################################################################
-# CONFIGURATION OPTIONS
-###############################################################################
-# Feature switches; all default to OFF.
-option(USE_HIP       "Build HIP version of the solver"              OFF)
-option(USE_ROCSHMEM  "Build rocSHMEM enabled version of the solver" OFF)
-option(ALL_ANALYZE   "Build analyze and solve algorithm"            OFF)
-option(USE_DOUBLE    "Use double precision floats for the data"     OFF)
-option(ALL_LEVELSET  "Build levelset algorithm"                     OFF)
-option(ALL_LEVELSYNC "Build levelsync algorithm"                    OFF)
-option(ALL_SYNCFREE  "Build syncfree algorithm"                     OFF)
-
-# Generate config.h from its template. The paths are spelled out, but they are
-# the same locations the relative form resolves to (input against the current
-# source directory, output against the current binary directory).
-configure_file(
-  "${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.h.in"
-  "${CMAKE_CURRENT_BINARY_DIR}/config.h"
-)
-
-###############################################################################
-# SOURCES
-###############################################################################
-# Sources are attached below with target_sources(); since CMake 3.11 an
-# executable may be declared without listing any.
-add_executable(${PROJECT_NAME})
-
-target_include_directories(
-  ${PROJECT_NAME}
-  PRIVATE
-    "${CMAKE_CURRENT_SOURCE_DIR}"
-    # config.h is generated by configure_file() into the *current* binary
-    # directory, which differs from CMAKE_BINARY_DIR when this project is
-    # pulled in via add_subdirectory().
-    "${CMAKE_CURRENT_BINARY_DIR}"
-)
-
-target_sources(
-  ${PROJECT_NAME}
-  PRIVATE
-    InputFlags.cpp
-    Main.cpp
-)
-
-###############################################################################
-# HIP / HIP + rocSHMEM
-###############################################################################
-if(USE_HIP)
-  find_package(hip REQUIRED)
-
-  target_sources(
-    ${PROJECT_NAME}
-    PRIVATE
-      HIPHelper.cpp
-  )
-
-  if(USE_ROCSHMEM)
-    find_package(rocshmem CONFIG REQUIRED)
-
-    # rocshmem::rocshmem is an imported target, not a directory, so it belongs
-    # in target_link_libraries() only; linking it is what hands its usage
-    # requirements to this executable.
-    target_link_libraries(
-      ${PROJECT_NAME}
-      PRIVATE
-        rocshmem::rocshmem
-        hip::host
-    )
-
-    # Driver flag for the link step; kept out of the library list so it is not
-    # positioned among the libraries on the link line.
-    target_link_options(
-      ${PROJECT_NAME}
-      PRIVATE
-        -fgpu-rdc
-    )
-  endif()
-
-###############################################################################
-# OPENCL
-###############################################################################
-else()
-
-  if(USE_ROCSHMEM)
-    message(FATAL_ERROR "Cannot use rocSHMEM without USE_HIP")
-  endif()
-
-  target_sources(
-    ${PROJECT_NAME}
-    PRIVATE
-      OpenCLHelper.cpp
-  )
-
-  # NOTE(review): the OpenCL location is hard-coded to a default ROCm install;
-  # consider find_package(OpenCL) once the install layout is confirmed.
-  target_include_directories(
-    ${PROJECT_NAME}
-    PRIVATE
-      /opt/rocm/opencl/include
-  )
-
-  target_link_libraries(
-    ${PROJECT_NAME}
-    PRIVATE
-      -L/opt/rocm/opencl/lib/x86_64
-      -lOpenCL
-  )
-
-endif()
diff --git a/projects/rocshmem/internal/clients/spts/GPUHelper.h b/projects/rocshmem/internal/clients/spts/GPUHelper.h
deleted file mode 100644
index 7773726568..0000000000
--- a/projects/rocshmem/internal/clients/spts/GPUHelper.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-#ifndef GPUHelper_H
-#define GPUHelper_H
-
-#include "config.h"
-
-#include <string>
-#include <iostream>
-#include <sstream>
-#include "InputFlags.h"
-
-#define ROW_BITS 32 // This may not be the right place to define this macro
-#define WG_BITS 24
-
-// Internal linkage: every translation unit that includes this header gets its
-// own copy. Main.cpp assigns it from the "block_size" input flag.
-static int SPTS_BLOCK_SIZE = 0;
-
-// WF_PER_WG / WF_SIZE: presumably wavefronts per work-group and lanes per
-// wavefront -- TODO confirm against the kernels that use them.
-#ifdef USE_ROCSHMEM
-#define WF_PER_WG 1
-#else
-#define WF_PER_WG 16
-#endif
-#define WF_SIZE 64
-
-// Backend-neutral aliases so the solver can be written once against either
-// runtime.
-#ifdef USE_HIP
-    // HIP build: plain host-side types; the memory-flag macros all collapse
-    // to 0.
-    #include <hip/hip_runtime.h>
-    typedef void * memPointer;
-    typedef int memPointer_flags;
-    typedef int gpuInt;
-    typedef bool gpuBool;
-    typedef hipEvent_t gpuEvent;
-    typedef hipError_t gpuError;
-    #define GPU_MEM_READ_ONLY 0
-    #define GPU_MEM_READ_WRITE 0
-    #define GPU_MEM_USE_HOST_PTR 0
-    #define GPU_TRUE true
-    #define GPU_FALSE false
-#else
-    // OpenCL build: thin aliases over the cl_* types and constants.
-#include <CL/cl.h>
-    typedef cl_mem memPointer;
-    typedef cl_mem_flags memPointer_flags;
-    typedef cl_int gpuInt;
-    typedef cl_bool gpuBool;
-    typedef cl_event gpuEvent;
-    typedef cl_int gpuError;
-    #define GPU_MEM_READ_ONLY CL_MEM_READ_ONLY
-    // Was mapped to CL_MEM_READ_ONLY, which made "read-write" buffers
-    // read-only from the kernel's point of view.
-    #define GPU_MEM_READ_WRITE CL_MEM_READ_WRITE
-    #define GPU_MEM_USE_HOST_PTR CL_MEM_USE_HOST_PTR
-    #define GPU_TRUE CL_TRUE
-    #define GPU_FALSE CL_FALSE
-#endif
-
-/**
- * Abstract interface the solver uses to talk to a GPU runtime.
- * HIPHelper is the HIP implementation of it.
- */
-class GPUHelper
-{
-	public:
-	GPUHelper() {}
-	// Implementations are handled through GPUHelper* (see main()), so
-	// destruction through the base pointer has to be well defined.
-	virtual ~GPUHelper() {}
-	// Set up the runtime. main() treats a return value of 1 as failure.
-	virtual int Init(const std::string &_filename, InputFlags &in_flags) = 0;
-	// Inspect a runtime status code; errString describes the failed action.
-	virtual void checkStatus(gpuError status, const std::string errString) = 0;
-	// Transfer _size bytes from host buffer _h_buf into device buffer _d_buf.
-	virtual void CopyToDevice(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, gpuBool _blocking, gpuEvent *_ev) = 0;
-	// Transfer _size bytes from device buffer _d_buf into host buffer _h_buf.
-	virtual void CopyToHost(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, gpuBool _blocking, gpuEvent *_ev) = 0;
-	// Allocate a device buffer; `name` is used for diagnostics.
-	virtual memPointer AllocateMem(const std::string name, size_t, memPointer_flags flags, void *) = 0;
-	// Release a buffer obtained from AllocateMem().
-	virtual void FreeMem(memPointer ptr) = 0;
-	// Wait for / push out outstanding work (backend specific).
-	virtual void Flush() = 0;
-};
-
-#endif //GPUHelper_H
diff --git a/projects/rocshmem/internal/clients/spts/HIPHelper.cpp b/projects/rocshmem/internal/clients/spts/HIPHelper.cpp
deleted file mode 100644
index 57e94e655b..0000000000
--- a/projects/rocshmem/internal/clients/spts/HIPHelper.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-
-#include "HIPHelper.h"
-#include <cstring>
-#include <string>
-#include <iostream>
-
-/**
- * Select HIP device 0 and print its name and architecture.
- *
- * Neither `filename` nor `in_flags` is consulted by the HIP backend.
- *
- * @return 0 on success, 1 if a HIP call fails (the value main() tests for).
- */
-int HIPHelper::Init(const std::string &filename, InputFlags &in_flags)
-{
-    const int device = 0;
-
-    hipError_t status = hipSetDevice(device);
-    if (status != hipSuccess)
-    {
-        std::cerr << "HIP error selecting device " << device << " : "
-                  << hipGetErrorString(status) << std::endl;
-        return 1;
-    }
-
-    hipDeviceProp_t props;
-    status = hipGetDeviceProperties(&props, device /*deviceID*/);
-    if (status != hipSuccess)
-    {
-        std::cerr << "HIP error querying device " << device << " : "
-                  << hipGetErrorString(status) << std::endl;
-        return 1;
-    }
-
-    printf("info: running on device %s\n", props.name);
-    // NOTE(review): gcnArch may be unavailable in newer HIP releases
-    // (gcnArchName is the string form) -- confirm against the targeted ROCm.
-    printf("info: architecture on AMD GPU device is: %d\n", props.gcnArch);
-
-    return 0;
-}
-
-/**
- * Abort the program when `status` reports a HIP failure.
- *
- * On failure, writes "<errString> : <HIP error text>" to stderr and exits
- * with status -1. Returns normally when the call succeeded.
- */
-void HIPHelper::checkStatus(gpuError status, const std::string errString)
-{
-    if (status == HIP_SUCCESS)
-    {
-        return;
-    }
-
-    const char *reason = hipGetErrorString(status);
-    std::cerr << errString << " : " << reason << std::endl;
-    exit(-1);
-}
-
-/**
- * Allocate `size` bytes with hipMalloc and log the resulting address.
- *
- * `name` only labels the log line and the error message; `flags` and
- * `hostBuffer` are not used by this backend. Allocation failure is fatal
- * (handled by checkStatus).
- */
-memPointer HIPHelper::AllocateMem(const std::string name,
-                            size_t size,
-                            memPointer_flags flags,
-                            void *hostBuffer)
-{
-    void *device_ptr;
-    const hipError_t rc = hipMalloc(&device_ptr, size);
-    checkStatus(rc, "HIP error allocating " + name + " !");
-
-    printf("Allocating %s of size %zu at buf %p\n", name.c_str(), size, device_ptr);
-    return device_ptr;
-}
-
-/**
- * Copy `size` bytes from `hostBuffer` into `devBuffer`.
- *
- * Only whole-buffer copies are supported (`offset` must be 0). `blocking`
- * and `ev` are ignored: the copy is a plain host-side memcpy.
- *
- * NOTE(review): this dereferences `devBuffer` on the host, so it assumes the
- * allocation is host-accessible. A hipMemcpy / hipMemcpyAsync variant that
- * honoured `blocking` (direction hipMemcpyHostToDevice) was previously kept
- * here commented out -- confirm which path is intended.
- */
-void HIPHelper::CopyToDevice(memPointer devBuffer,
-                                void *hostBuffer,
-                                size_t size,
-                                size_t offset,
-                                gpuBool blocking,
-                                gpuEvent *ev)
-{
-    assert(0 == offset);
-    std::memcpy(devBuffer, hostBuffer, size);
-}
-
-/**
- * Copy `size` bytes from `devBuffer` into `hostBuffer`.
- *
- * Only whole-buffer copies are supported (`offset` must be 0). `blocking`
- * and `ev` are ignored: the copy is a plain host-side memcpy.
- *
- * NOTE(review): this dereferences `devBuffer` on the host, so it assumes the
- * allocation is host-accessible. A hipMemcpy / hipMemcpyAsync variant that
- * honoured `blocking` (direction hipMemcpyDeviceToHost) was previously kept
- * here commented out -- confirm which path is intended.
- */
-void HIPHelper::CopyToHost(memPointer devBuffer,
-                                void *hostBuffer,
-                                size_t size,
-                                size_t offset,
-                                gpuBool blocking,
-                                gpuEvent *ev)
-{
-    assert(0 == offset);
-    std::memcpy(hostBuffer, devBuffer, size);
-}
diff --git a/projects/rocshmem/internal/clients/spts/HIPHelper.h b/projects/rocshmem/internal/clients/spts/HIPHelper.h
deleted file mode 100644
index b7e1de1bd4..0000000000
--- a/projects/rocshmem/internal/clients/spts/HIPHelper.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-// The guard is named after this header (it used to be CLHelper_H).
-#ifndef HIPHelper_H
-#define HIPHelper_H
-
-// NOTE(review): these OpenCL compatibility switches look like residue from
-// the OpenCL helper; kept because GPUHelper.h may pull in <CL/cl.h>.
-#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-
-#include <string>
-#include <iostream>
-#include <sstream>
-#include "InputFlags.h"
-#include "GPUHelper.h"
-#include "hip/hip_runtime.h"
-
-/**
- * HIP implementation of the GPUHelper interface.
- * `override` makes the compiler verify each signature against the base.
- */
-class HIPHelper : public GPUHelper
-{
-	public:
-	HIPHelper() {}
-	int Init(const std::string &_filename, InputFlags &in_flags) override;
-	void checkStatus(gpuError status, const std::string errString) override;
-	void CopyToDevice(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, gpuBool _blocking, gpuEvent *_ev) override;
-	void CopyToHost(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, gpuBool _blocking, gpuEvent *_ev) override;
-	memPointer AllocateMem(const std::string name, size_t, memPointer_flags flags, void *) override;
-	// Release a buffer with hipFree(); the HIP status is not inspected.
-	void FreeMem(memPointer ptr) override { hipFree(ptr); }
-	// Block until the device is idle (hipDeviceSynchronize).
-	void Flush() override { hipDeviceSynchronize(); }
-};
-
-#endif //HIPHelper_H
-
diff --git a/projects/rocshmem/internal/clients/spts/InputFlags.cpp b/projects/rocshmem/internal/clients/spts/InputFlags.cpp
deleted file mode 100644
index 262d58d15b..0000000000
--- a/projects/rocshmem/internal/clients/spts/InputFlags.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-#include <iomanip>
-#include <vector>
-#include <iostream>
-#include "InputFlags.h"
-
-/**
- * Register the built-in "help" flag (-h) so it is always available.
- */
-InputFlags::InputFlags()
-{
-	AddInputFlag(
-		"help",
-		'h',
-		"",
-		"Print Help Message",
-		"string");
-}
-
-/**
- * Register a flag under its single-character short name.
- *
- * A short name that is already registered is left untouched and a notice is
- * printed instead.
- */
-void InputFlags::AddInputFlag(const std::string &_long_name,
-							char _short_name,
-							const std::string &_value,
-							const std::string &_help_text,
-							const std::string &_type)
-{
-	if(MapInputs.find(_short_name) != MapInputs.end())
-	{
-		printf("Input flag: %s (%c) already exists !", _long_name.c_str(), _short_name);
-		return;
-	}
-
-	// Member order of Input: long_name, short_name, value, help_text, type.
-	MapInputs[_short_name] = Input{_long_name, _short_name, _value, _help_text, _type};
-}
-
-/**
- * Print one help line per registered flag, then terminate the process with
- * exit status 0.
- */
-void InputFlags::Print()
-{
-	printf("SpTS Input Flags: \n\n");
-
-	for(auto &content : MapInputs)
-	{
-		const Input &flag = content.second;
-
-		// Pad the long name out to a 20-character column. length() is
-		// unsigned, so the subtraction is done in int and clamped; otherwise
-		// a name of 20 or more characters would wrap to a huge width.
-		const int name_len = static_cast<int>(flag.long_name.length());
-		const int gap = (name_len < 20) ? (20 - name_len) : 0;
-
-		std::cout<<std::setw(8)<<"--"<<flag.long_name<<std::setw(gap)<<"-"<<content.first<<std::setw(8)<<" "<<flag.help_text<<"\n";
-	}
-	exit(0);
-}
-
-/**
- * Translate a flag's long name into its short name.
- *
- * If several entries share the long name, the one with the greatest key
- * wins, hence the reverse scan that stops at the first hit. An unknown
- * name prints a message and terminates the process with exit status 0.
- */
-char InputFlags::FindShortName(const std::string &long_name)
-{
-	char found = '\0';
-
-	for(auto it = MapInputs.rbegin(); it != MapInputs.rend(); ++it)
-	{
-		if(it->second.long_name == long_name)
-		{
-			found = it->first;
-			break;
-		}
-	}
-
-	if(found == '\0')
-	{
-		std::cout<<"Long Name: "<<long_name<<" Not Found !";
-		exit(0);
-	}
-
-	return found;
-}
-
-/**
- * Parse the command line into the registered flags.
- *
- * Accepted forms: "--long_name [value]", "-s [value]", "--help", "-h", "-?".
- * The flags with short names 'n', 'z' and 'v' are switches: they take no
- * value and are stored as "true". Every other flag consumes the following
- * argument as its value.
- *
- * Any malformed input (no arguments, an argument that does not start with
- * '-', an unknown flag, or a value-taking flag with nothing after it) prints
- * a message and/or the help text and terminates the process.
- */
-void InputFlags::Parse(int argc, char *argv[])
-{
-	std::vector<std::string> args;
-	for(int i = 1; i < argc; i++)
-		args.push_back(argv[i]);
-
-	if(args.empty()) // No Input Flag
-		Print();
-
-	// Store the value for `short_name`; `pos` is the index of the flag itself
-	// and is advanced past the value when one is consumed.
-	auto store_value = [this, &args](char short_name, std::size_t &pos)
-	{
-		if(short_name == 'n' || short_name == 'z' || short_name == 'v')
-		{
-			MapInputs[short_name].value = "true";
-			return;
-		}
-
-		// Without this check a trailing value-taking flag would read
-		// args[pos + 1] past the end of the vector.
-		if(pos + 1 >= args.size())
-		{
-			std::cout<<"Input Flag: "<<short_name<<" Requires A Value !\n";
-			Print(); // does not return
-		}
-
-		MapInputs[short_name].value = args[++pos];
-	};
-
-	for(std::size_t i = 0; i < args.size(); i++)
-	{
-		const std::string &temp = args[i];
-		if(temp[0] != '-')
-		{
-			printf("Illegal input flag\n");
-			Print();
-		}
-		else if(temp[1] == '-') // Long Name Input
-		{
-			std::string long_name = temp.substr(2);
-			if(long_name == "help")
-				Print();
-
-			store_value(FindShortName(long_name), i);
-		}
-		else if(temp[1] == '?') // Help Input
-			Print();
-		else // Short Name Input
-		{
-			char short_name = temp[1];
-			if(MapInputs.find(short_name) == MapInputs.end())
-			{
-				std::cout<<"Input Flag: "<<short_name<<" Not Found !";
-				exit(0);
-			}
-			if(short_name == 'h')
-				Print();
-
-			store_value(short_name, i);
-		}
-	}
-}
-
-/**
- * Value of the flag called `long_name`, as stored (no conversion).
- * An unknown name terminates the process inside FindShortName().
- */
-std::string InputFlags::GetValueStr(const std::string &long_name)
-{
-	const char key = FindShortName(long_name);
-	return MapInputs[key].value;
-}
-
-/**
- * Value of the flag called `long_name`, converted with atoi().
- * An unknown name terminates the process inside FindShortName().
- */
-int InputFlags::GetValueInt(const std::string &long_name)
-{
-	const char key = FindShortName(long_name);
-	const std::string &text = MapInputs[key].value;
-	return atoi(text.c_str());
-}
-
-/**
- * Value of the flag called `long_name`, parsed as a base-10 unsigned
- * integer with strtoull().
- * An unknown name terminates the process inside FindShortName().
- */
-uint64_t InputFlags::GetValueUint64(const std::string &long_name)
-{
-    const char key = FindShortName(long_name);
-    const std::string &text = MapInputs[key].value;
-    return strtoull(text.c_str(), NULL, 10);
-}
-
-/**
- * Value of the flag called `long_name`, converted with std::stof().
- * std::stof() throws if the stored text is not a number. An unknown name
- * terminates the process inside FindShortName().
- */
-float InputFlags::GetValueFloat(const std::string &long_name)
-{
-    const char key = FindShortName(long_name);
-    return std::stof(MapInputs[key].value);
-}
-
-/**
- * True exactly when the flag called `long_name` holds the text "true".
- * An unknown name terminates the process inside FindShortName().
- */
-bool InputFlags::GetValueBool(const std::string &long_name)
-{
-    const char key = FindShortName(long_name);
-    return MapInputs[key].value == "true";
-}
diff --git a/projects/rocshmem/internal/clients/spts/InputFlags.h b/projects/rocshmem/internal/clients/spts/InputFlags.h
deleted file mode 100644
index 0e1dec51ae..0000000000
--- a/projects/rocshmem/internal/clients/spts/InputFlags.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-#ifndef InputFlags_H
-#define InputFlags_H
-
-#include <string>
-#include <map>
-
-// One registered command-line flag.
-struct Input
-{
-    std::string long_name;  // name used after "--"
-    char short_name;        // single character used after "-"; also the map key
-    std::string value;      // current value as text (initially the default passed at registration)
-    std::string help_text;  // description shown by InputFlags::Print()
-    std::string type;       // type label supplied at registration
-};
-
-// Registry and parser for command-line flags.
-// Abstract: a derived class supplies its own flags via AddDerivedInputFlags().
-class InputFlags
-{
-    // Registered flags, keyed by short name.
-    std::map<char, Input> MapInputs;
-
-    public:
-    // Registers the built-in "help" flag.
-    InputFlags();
-    // Hook for the derived class to register its flags.
-    virtual void AddDerivedInputFlags() = 0;
-    // Register one flag; a short name that is already taken is rejected
-    // with a printed notice.
-    void AddInputFlag(const std::string &_long_name,
-                    char _short_name,
-                    const std::string &_value,
-                    const std::string &_help_text,
-                    const std::string &type);
-    // Fill the registered flags from the command line.
-    void Parse(int argc, char *argv[]);
-    // Map a long name to its short name; exits the process if unknown.
-    char FindShortName(const std::string &long_name);
-    // Print the help text and exit the process.
-    void Print();
-
-    // Typed accessors, looked up by long name.
-    std::string GetValueStr(const std::string &long_name);
-    int GetValueInt(const std::string &long_name);
-    uint64_t GetValueUint64(const std::string &long_name);
-    float GetValueFloat(const std::string &long_name);
-    bool GetValueBool(const std::string &long_name);
-
-    virtual ~InputFlags() {}
-};
-
-#endif //InputFlags_H
diff --git a/projects/rocshmem/internal/clients/spts/LICENSE b/projects/rocshmem/internal/clients/spts/LICENSE
deleted file mode 100644
index b1a3ae16d2..0000000000
--- a/projects/rocshmem/internal/clients/spts/LICENSE
+++ /dev/null
@@ -1,19 +0,0 @@
-Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
diff --git a/projects/rocshmem/internal/clients/spts/Main.cpp b/projects/rocshmem/internal/clients/spts/Main.cpp
deleted file mode 100644
index 429e1a242c..0000000000
--- a/projects/rocshmem/internal/clients/spts/Main.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-
-#include "config.h"
-
-#ifdef USE_HIP
-#include "hip/hip_runtime.h"
-#else
-#include "OpenCLHelper.h"
-#endif
-
-// MPI is needed by the rocSHMEM build only (MPI_Allreduce in main()). The
-// macro tested here has to be USE_ROCSHMEM, the name used everywhere else.
-#ifdef USE_ROCSHMEM
-#include "mpi.h"
-#endif
-
-#include "MatrixMarketReader.h"
-#include "SpTS.h"
-#include <iostream>
-#include <unistd.h>
-#include <limits.h>
-
-// Floating-point type used for the matrix data: double when USE_DOUBLE is
-// defined, float otherwise.
-#ifdef USE_DOUBLE
-typedef double FPTYPE;
-#else
-typedef float FPTYPE;
-#endif
-
-using namespace rocshmem;
-
-/**
- * Path of the spts_kernel.cl file expected next to the running executable,
- * resolved through /proc/self/exe.
- *
- * readlink() does not NUL-terminate its output, so the terminator is written
- * explicitly using the returned length. If the link cannot be read, the
- * current directory is used instead.
- */
-static std::string KernelSourcePath()
-{
-    char buf[PATH_MAX + 1];
-    ssize_t len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
-    if (len < 0)
-    {
-        perror("readlink(/proc/self/exe)");
-        return "./spts_kernel.cl";
-    }
-    buf[len] = '\0';
-
-    std::string exe_path(buf);
-    return exe_path.substr(0, exe_path.rfind('/')) + "/spts_kernel.cl";
-}
-
-/**
- * Driver: read a Matrix Market file, run the sparse triangular solve on the
- * CPU and on the GPU, optionally verify the GPU result, and print timings.
- */
-int main(int argc, char *argv[])
-{
-    SparseTriangularSolve<FPTYPE> spts_obj;
-    InputFlags &in_flags = spts_obj;
-    in_flags.AddDerivedInputFlags();
-    in_flags.Parse(argc, argv);
-    FPTYPE alpha = in_flags.GetValueFloat("alpha");
-
-    printf("Reading input file: %s...", in_flags.GetValueStr("filename").c_str());fflush(stdout);
-    MatrixMarketReader<FPTYPE> mm_reader;
-    if (mm_reader.MMReadFormat(in_flags.GetValueStr("filename"), in_flags) != 0)
-    {
-        fprintf(stderr, "ERROR reading input file !\n");
-        exit(1);
-    }
-    printf("Done.\n");
-
-    GPUHelper *GPU;
-#ifdef USE_HIP
-    printf("Initializing HIP runtime...\n\t");fflush(stdout);
-    GPU = new HIPHelper();
-    const std::string kernel_path = KernelSourcePath();
-    printf("Going to try to open %s\n", kernel_path.c_str());
-    if(GPU->Init(kernel_path, in_flags) == 1)
-    {
-        fflush(stdout);
-        fprintf(stderr,"\nError Initializing HIP Runtime !\n");
-        exit(-1);
-    }
-#else
-    printf("Initializing OpenCL runtime...\n\t");fflush(stdout);
-    GPU = new CLHelper();
-    const std::string kernel_path = KernelSourcePath();
-    printf("Going to try to open %s\n", kernel_path.c_str());
-    if(GPU->Init(kernel_path, in_flags) == 1)
-    {
-        fflush(stdout);
-        fprintf(stderr,"\nError Initializing OpenCL Runtime !\n");
-        exit(-1);
-    }
-#endif
-    printf("Done.\n");
-
-    printf("Allocating sparse matrices...");fflush(stdout);
-    spts_obj.AllocateSparseMatrix(mm_reader, in_flags, GPU);
-    printf("Done.\n");
-
-    printf("Converting COO to CSR...");fflush(stdout);
-    spts_obj.ConvertFromCOOToCSR(mm_reader.GetCoordinates(), in_flags);
-    printf("Done.\n");
-
-    SPTS_BLOCK_SIZE = in_flags.GetValueInt("block_size");
-    printf("Finding Stats For Parallel Decomposition...");fflush(stdout);
-    spts_obj.FindStatsForParallelDecomposition();
-    printf("Done.\n");
-
-    printf("Allocating parallel sparse matrices...");fflush(stdout);
-    spts_obj.AllocateParallelSparseMatrix(mm_reader, in_flags);
-    printf("Done.\n");
-
-    printf("Allocating vectors...");fflush(stdout);
-    spts_obj.AllocateVectors(mm_reader);
-    printf("Done.\n");
-
-    // Filled in by CSRSpTSGPU(); a value of 0 is reported as "no_iter" below.
-    float gflops = 0.f;
-    uint64_t ns_per_iter = 0;
-    uint64_t ns_per_analysis_iter = 0;
-    uint64_t ns_per_syncfree_iter = 0;
-    uint64_t ns_per_levelset_iter = 0;
-    uint64_t ns_per_levelsync_iter = 0;
-
-    printf("Performing SpTS on the CPU with alpha=%f...", (float)alpha);fflush(stdout);
-    spts_obj.CSRSpTSCPU(alpha);
-    printf("Done.\n");
-
-    printf("Checking results of CPU-side SpTS...");fflush(stdout);
-    if (!spts_obj.CSRCheckCPU(alpha))
-    {
-        // Deliberately non-fatal: the run continues after the warning.
-        fflush(stdout);
-        fprintf(stderr, "CPU-based results were 'wrong', likely due to FP rounding. Expect the CPU and GPU to differ wildly.\n");
-    }
-    printf("Done.\n");
-
-    printf("Performing %d iterations of SpTS on the GPU with alpha=%f...", in_flags.GetValueInt("iterations"), (float)alpha);fflush(stdout);
-    gflops = spts_obj.CSRSpTSGPU(ns_per_iter, ns_per_analysis_iter, ns_per_syncfree_iter, ns_per_levelset_iter, ns_per_levelsync_iter, alpha);
-    printf("Done.\n");
-
-    if (in_flags.GetValueBool("verify")) {
-        printf("Checking whether GPU SpTS caused non-deterministic errors...\n");fflush(stdout);
-        int non_det_errors = spts_obj.NonDeterministicErrors();
-        printf("Done.\n");
-        if (non_det_errors)
-            fprintf(stderr, "ERROR!! -- Saw %d GPU iterations that had non-deterministic differences.\n", non_det_errors);
-        int max_errors = spts_obj.MaxErrors();
-        if (max_errors)
-        {
-            if (max_errors > 1)
-                printf(" -- %d rows differed between CPU and GPU results.\n", max_errors);
-            else
-                printf(" -- %d row differed between CPU and GPU results.\n", max_errors);
-        }
-        else
-            printf("\n");
-    }
-
-    printf("File %s : SpTS Gflops: %f ms_per_iter: %lf ", in_flags.GetValueStr("filename").c_str(), gflops, ((double)ns_per_iter/1000000.));
-    printf(" ( ms_per_analysis_iter: ");
-    if (ns_per_analysis_iter == 0)
-        printf("no_iter");
-    else
-        printf("%lf", ((double)ns_per_analysis_iter/1000000.));
-    printf(" | ms_per_syncfree_iter: ");
-    if (ns_per_syncfree_iter == 0)
-        printf("no_iter");
-    else
-        printf("%lf", ((double)ns_per_syncfree_iter/1000000.));
-    printf(" | ms_per_levelset_iter: ");
-    if (ns_per_levelset_iter == 0)
-        printf("no_iter");
-    else
-        printf("%lf", ((double)ns_per_levelset_iter/1000000.));
-    printf(" | ms_per_levelsync_iter: ");
-    if (ns_per_levelsync_iter == 0)
-        printf("no_iter )");
-    else
-        printf("%lf )", ((double)ns_per_levelsync_iter/1000000.));
-
-#ifdef USE_ROCSHMEM
-    // Sum the per-PE analysis time in place; the MPI datatype matches the
-    // uint64_t operand.
-    MPI_Allreduce(MPI_IN_PLACE, (void *) &ns_per_analysis_iter, 1,
-                  MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
-
-    if (spts_obj.Get_this_pe() == 0) {
-       printf("\nRANK 0: analysis avg ms = %lf\n",
-              ((double) ns_per_analysis_iter / 1000000.) / spts_obj.Get_total_pes());
-    }
-#endif
-
-    return 0;
-}
diff --git a/projects/rocshmem/internal/clients/spts/MatrixMarketReader.h b/projects/rocshmem/internal/clients/spts/MatrixMarketReader.h
deleted file mode 100644
index 00403b64ac..0000000000
--- a/projects/rocshmem/internal/clients/spts/MatrixMarketReader.h
+++ /dev/null
@@ -1,377 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-#ifndef MatrixMarketReader_H
-#define MatrixMarketReader_H
-/*
-Portions of this file include code provided by The National Institute of
-Standards and Technology (NIST).  The code includes
-macro definitions from mmio.h and is subject to the following disclaimer.
-
-Software Disclaimer
-
-NIST-developed software is provided by NIST as a public service. You may use,
-copy and distribute copies of the software in any medium, provided that you
-keep intact this entire notice. You may improve, modify and create derivative
-works of the software or any portion of the software, and you may copy and
-distribute such modifications or works. Modified works should carry a notice
-stating that you changed the software and should note the date and nature of
-any such change. Please explicitly acknowledge the National Institute of
-Standards and Technology as the source of the software.
-
-NIST-developed software is expressly provided "AS IS" NIST MAKES NO WARRANTY
-OF ANY KIND, EXPRESS, IMPLIED, IN FACT OR ARISING BY OPERATION OF LAW,
-INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT AND DATA ACCURACY. NIST
-NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE
-UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES
-NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR
-THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY,
-RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
-
-You are solely responsible for determining the appropriateness of using and
-distributing the software and you assume all risks associated with its use,
-including but not limited to the risks and costs of program errors, compliance
-with applicable laws, damage to or loss of data, programs or equipment, and
-the unavailability or interruption of operation. This software is not intended
-to be used in any situation where a failure could cause risk of injury or
-damage to property. The software developed by NIST employees is not subject
-to copyright protection within the United States.
-*/
-
-#include <string>
-#include <cstring>
-#include <fstream>
-#include <cstdio>
-#include <iostream>
-#include "InputFlags.h"
-#include <typeinfo>
-#include "mmio.h"
-
-// Class declaration
-
-// One matrix entry in coordinate (COO) form, as stored by MatrixMarketReader.
-template <typename FloatType>
-struct Coordinate
-{
-    int x;          // zero-based index; FillCoordData() stores (ir - 1) here
-    int y;          // zero-based index; FillCoordData() stores (ic - 1) here
-    FloatType val;  // numerical value attached to the (x, y) position
-};
-
-/*
- * Reads a Matrix Market coordinate file into an array of Coordinate entries.
- *
- * MMReadFormat() is the entry point: it parses the banner and the size line,
- * allocates `coords` and the `has_seen_diag` scratch array, and fills them
- * through MMGenerateCOOFromFile().  The reader owns both arrays and releases
- * them in its destructor, so the pointer returned by GetCoordinates() is only
- * valid while the reader object is alive.
- *
- * NOTE(review): the class has an owning destructor but relies on the
- * compiler-generated copy operations; copying an instance would free the
- * arrays twice.  Confirm that no caller copies readers.
- */
-template <typename FloatType>
-class MatrixMarketReader
-{
-    char Typecode[4];               // mmio type code, filled in by MMReadBanner()
-    int nNZ;                        // count from the size line; overwritten with the stored count after loading
-    int nRows;                      // first value of the size line
-    int nCols;                      // second value of the size line
-    int isSymmetric;                // set to 1 when mm_is_symmetric(Typecode) holds
-    int isDoubleMem;                // initialised to 0; not otherwise touched in this header
-    Coordinate<FloatType> *coords;  // owned; allocated by MMReadFormat()
-    bool *has_seen_diag;            // owned; one flag per row, allocated by MMReadFormat()
-
-  public:
-    // Both owned pointers start out NULL so the destructor is safe even when
-    // MMReadFormat() was never called or failed before allocating.
-    MatrixMarketReader()
-        : nNZ(0), nRows(0), nCols(0), isSymmetric(0), isDoubleMem(0),
-          coords(NULL), has_seen_diag(NULL)
-    {
-        for (size_t i = 0; i < sizeof(Typecode); i++)
-            Typecode[i] = '\0';
-    }
-
-    bool MMReadFormat(const std::string &_filename, InputFlags &_in_flags);
-    bool MMReadBanner(FILE *_infile);
-    bool MMReadMtxCrdSize(FILE *_infile);
-    void MMGenerateCOOFromFile(FILE *_infile, InputFlags &_in_flags);
-
-    int GetNumRows() { return nRows; }
-    int GetNumCols() { return nCols; }
-    int GetNumNonZeroes() { return nNZ; }
-    int GetSymmetric() { return isSymmetric; }
-
-    char *GetTypecode() { return Typecode; }
-    Coordinate<FloatType> *GetCoordinates() { return coords; }
-
-    ~MatrixMarketReader()
-    {
-        delete[] coords;
-        delete[] has_seen_diag;  // previously leaked
-    }
-};
-
-// Class definition
-
-/*
- * Opens `filename`, parses the Matrix Market banner and size line, allocates
- * the coordinate storage and loads the entries.
- *
- * Only coordinate-format, non-complex matrices are accepted.  A matrix that is
- * not declared symmetric is accepted only when the "non_symmetric" input flag
- * is set.
- *
- * Returns 0 (false) on success and 1 (true) on any failure -- note that the
- * sense is inverted relative to a typical bool "ok" result.
- *
- * The file handle is closed on every path, including all error returns.
- */
-template<typename FloatType>
-bool MatrixMarketReader<FloatType>::MMReadFormat(const std::string &filename, InputFlags &in_flags)
-{
-    FILE *mm_file = fopen(filename.c_str(), "r");
-    if (mm_file == NULL)
-    {
-        printf("Cannot Open Matrix-Market File !\n");
-        return 1;
-    }
-
-    int status = MMReadBanner(mm_file);
-    if (status != 0)
-    {
-        printf("Error Reading Banner in Matrix-Market File !\n");
-        fclose(mm_file);
-        return 1;
-    }
-
-    if (!mm_is_coordinate(Typecode))
-    {
-        printf(" only handling coordinate format\n");
-        fclose(mm_file);
-        return 1;
-    }
-
-    if (mm_is_complex(Typecode))
-    {
-        printf("Error: cannot handle complex format\n");
-        fclose(mm_file);
-        return 1;
-    }
-
-    if (mm_is_symmetric(Typecode))
-        isSymmetric = 1;
-
-    status = MMReadMtxCrdSize(mm_file);
-    if (status != 0)
-    {
-        printf("Error reading Matrix Market crd_size %d\n", status);
-        fclose(mm_file);
-        return 1;
-    }
-
-    // The flag is only consulted when the banner does not say "symmetric".
-    if (!mm_is_symmetric(Typecode) && !in_flags.GetValueBool("non_symmetric"))
-    {
-        fprintf(stderr, "Error: Input matrix is NOT symmetric. This will not work for SpTS.\n");
-        fclose(mm_file);
-        return 1;
-    }
-
-    // Room for every entry in the file plus one fixed-up diagonal per row.
-    // For non-symmetric input this is larger than needed.
-    coords = new Coordinate<FloatType>[nNZ + nRows];
-
-    has_seen_diag = new bool[nRows];
-    for (int i = 0; i < nRows; i++)
-        has_seen_diag[i] = false;
-
-    MMGenerateCOOFromFile(mm_file, in_flags);
-    fclose(mm_file);
-    return 0;
-}
-
-/*
- * Appends one entry to `coords` at position `actual_nnz` and advances the
- * counter.  `ir` and `ic` are converted to zero-based indices by subtracting
- * one.  Entries whose second index exceeds the first are silently dropped,
- * and an entry with equal indices marks has_seen_diag for that index.
- * `Typecode` is accepted but not used.
- */
-template<typename FloatType>
-void FillCoordData(char Typecode[],
-                   Coordinate<FloatType> *coords,
-                   bool *has_seen_diag,
-                   int &actual_nnz,
-                   int ir,
-                   int ic,
-                   FloatType val)
-{
-    const int row = ir - 1;
-    const int col = ic - 1;
-
-    // Keep only the diagonal and what lies below it.
-    if (col > row)
-        return;
-
-    if (col == row)
-        has_seen_diag[row] = true;
-
-    Coordinate<FloatType> &slot = coords[actual_nnz];
-    slot.x = row;
-    slot.y = col;
-    slot.val = val;
-    ++actual_nnz;
-}
-
-/*
- * For every index in [0, nRows) whose has_seen_diag flag is still false,
- * appends an entry (i, i) with value 1 to `coords` and advances `actual_nnz`.
- * `Typecode` and `in_flags` are accepted but not used.
- */
-template<typename FloatType>
-void FixupMissingDiags(char Typecode[],
-                       Coordinate<FloatType> *coords,
-                       int &actual_nnz,
-                       int nRows,
-                       bool *has_seen_diag,
-                       InputFlags &in_flags)
-{
-    for (int row = 0; row < nRows; ++row)
-    {
-        if (has_seen_diag[row])
-            continue;
-
-        Coordinate<FloatType> &slot = coords[actual_nnz];
-        slot.x = row;
-        slot.y = row;
-        slot.val = 1.;
-        ++actual_nnz;
-    }
-}
-
-/*
- * Reads up to nNZ entries from `infile` into `coords`, then adds a unit
- * diagonal entry for every row that had none, and finally replaces nNZ with
- * the number of entries actually stored.
- *
- * Real and integer matrices carry a value per entry; pattern matrices carry
- * indices only and every entry is given the value 3.  Explicit zeroes are
- * dropped unless the "exp_zeroes" input flag is set.
- *
- * Reading stops early (with a message on stderr) when an entry cannot be
- * parsed, instead of storing stale or uninitialised values.  Entries whose
- * indices fall outside [1, nRows] x [1, nCols] are skipped so that they cannot
- * index past the end of has_seen_diag.
- */
-template<typename FloatType>
-void MatrixMarketReader<FloatType>::MMGenerateCOOFromFile(FILE *infile,
-                                        InputFlags &in_flags)
-{
-    int actual_nnz = 0;
-    const int declared_nnz = nNZ;
-    const int exp_zeroes = in_flags.GetValueBool("exp_zeroes");
-
-    // Real and integer files use the same "row col value" layout.
-    const bool has_values = mm_is_real(Typecode) || mm_is_integer(Typecode);
-    const bool is_pattern = !has_values && mm_is_pattern(Typecode);
-
-    for (int i = 0; i < declared_nnz; i++)
-    {
-        FloatType val = 0;
-        int ir = 0;
-        int ic = 0;
-        int expected = 0;
-        int got = 0;
-
-        if (has_values)
-        {
-            expected = 3;
-            if (typeid(FloatType) == typeid(float))
-                got = fscanf(infile, "%d %d %f\n", &ir, &ic, (float*)(&val));
-            else if (typeid(FloatType) == typeid(double))
-                got = fscanf(infile, "%d %d %lf\n", &ir, &ic, (double*)(&val));
-            // Any other FloatType leaves got == 0 and is reported below.
-        }
-        else if (is_pattern)
-        {
-            expected = 2;
-            got = fscanf(infile, "%d %d", &ir, &ic);
-            //val = ((FloatType) MAX_RAND_VAL * (rand() / (RAND_MAX + 1.0)));
-            val = 3.;
-        }
-        else
-        {
-            // Unsupported data type: there is nothing to read.
-            break;
-        }
-
-        if (got != expected)
-        {
-            fprintf(stderr, "Error: could not read matrix entry %d of %d; stopping.\n",
-                    i + 1, declared_nnz);
-            break;
-        }
-
-        if (ir < 1 || ir > nRows || ic < 1 || ic > nCols)
-        {
-            fprintf(stderr, "Warning: skipping out-of-range matrix entry (%d, %d).\n", ir, ic);
-            continue;
-        }
-
-        if (exp_zeroes == 0 && val == 0)
-            continue;
-
-        FillCoordData(Typecode, coords, has_seen_diag, actual_nnz, ir, ic, val);
-    }
-
-    FixupMissingDiags(Typecode, coords, actual_nnz, nRows, has_seen_diag, in_flags);
-    nNZ = actual_nnz;
-    printf("\n\tNNZ in the lower triangular and fixedup diagonal: %d\n", nNZ);
-}
-
-/*
- * Parses the first line of a Matrix Market file and records the result in
- * Typecode.  The line must contain five whitespace-separated fields: the
- * banner, the object ("matrix"), the format, the data type and the storage
- * scheme.  All fields except the banner are compared case-insensitively.
- *
- * Returns 0 (false) on success.  On failure an MM_* error code is returned,
- * but because the return type is bool the caller only sees "true"; the
- * specific code is lost.
- */
-template<typename FloatType>
-bool MatrixMarketReader<FloatType>::MMReadBanner(FILE *infile)
-{
-    char line[MM_MAX_LINE_LENGTH];
-    char banner[MM_MAX_TOKEN_LENGTH];
-    char mtx[MM_MAX_TOKEN_LENGTH];
-    char crd[MM_MAX_TOKEN_LENGTH];
-    char data_type[MM_MAX_TOKEN_LENGTH];
-    char storage_scheme[MM_MAX_TOKEN_LENGTH];
-    char scan_fmt[64];
-
-    mm_clear_typecode(Typecode);
-
-    if (fgets(line, MM_MAX_LINE_LENGTH, infile) == NULL)
-        return MM_PREMATURE_EOF;
-
-    // Bound every %s by the token buffer size: a line may be much longer than
-    // a single token buffer, and an unbounded %s would overflow the stack.
-    const int max_token = MM_MAX_TOKEN_LENGTH - 1;
-    snprintf(scan_fmt, sizeof(scan_fmt), "%%%ds %%%ds %%%ds %%%ds %%%ds",
-             max_token, max_token, max_token, max_token, max_token);
-
-    if (sscanf(line, scan_fmt, banner, mtx, crd, data_type,
-        storage_scheme) != 5)
-        return MM_PREMATURE_EOF;
-
-    /* convert everything but the banner to lower case */
-    char *fields[] = { mtx, crd, data_type, storage_scheme };
-    for (size_t f = 0; f < sizeof(fields) / sizeof(fields[0]); f++)
-    {
-        // tolower() requires a value representable as unsigned char.
-        for (char *p = fields[f]; *p != '\0'; p++)
-            *p = (char) tolower((unsigned char) *p);
-    }
-
-    /* check for banner */
-    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
-        return MM_NO_HEADER;
-
-    /* first field should be "mtx" */
-    if (strcmp(mtx, MM_MTX_STR) != 0)
-        return MM_UNSUPPORTED_TYPE;
-    mm_set_matrix(Typecode);
-
-    /* second field describes whether this is a sparse matrix (in coordinate
-            storage) or a dense array */
-    if (strcmp(crd, MM_SPARSE_STR) == 0)
-        mm_set_sparse(Typecode);
-    else if (strcmp(crd, MM_DENSE_STR) == 0)
-        mm_set_dense(Typecode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-
-    /* third field */
-    if (strcmp(data_type, MM_REAL_STR) == 0)
-        mm_set_real(Typecode);
-    else if (strcmp(data_type, MM_COMPLEX_STR) == 0)
-        mm_set_complex(Typecode);
-    else if (strcmp(data_type, MM_PATTERN_STR) == 0)
-        mm_set_pattern(Typecode);
-    else if (strcmp(data_type, MM_INT_STR) == 0)
-        mm_set_integer(Typecode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-
-    /* fourth field */
-    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
-        mm_set_general(Typecode);
-    else if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
-        mm_set_symmetric(Typecode);
-    else if (strcmp(storage_scheme, MM_HERM_STR) == 0)
-        mm_set_hermitian(Typecode);
-    else if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
-        mm_set_skew(Typecode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-
-    return 0;
-}
-
-/*
- * Skips the '%' comment lines that follow the banner and reads the size line
- * "rows cols entries" into nRows, nCols and nNZ.
- *
- * Returns 0 (false) on success and a non-zero MM_* code (seen by the caller
- * as true) when the three values cannot be read.
- */
-template<typename FloatType>
-bool MatrixMarketReader<FloatType>::MMReadMtxCrdSize(FILE *infile)
-{
-    char line[MM_MAX_LINE_LENGTH];
-
-    /* now continue scanning until you reach the end-of-comments */
-    do
-    {
-        if (fgets(line, MM_MAX_LINE_LENGTH, infile) == NULL)
-            return MM_PREMATURE_EOF;
-    } while (line[0] == '%');
-
-    /* line[] is either blank or has M,N, nz */
-    if (sscanf(line, "%d %d %d", &nRows, &nCols, &nNZ) == 3)
-        return 0;
-
-    // The first non-comment line did not hold all three values, so take them
-    // from what follows (fscanf skips blank lines on its own).  A single
-    // attempt is enough: once fscanf stops on a non-numeric token it does not
-    // consume it, so retrying could never succeed and would loop forever.
-    if (fscanf(infile, "%d %d %d", &nRows, &nCols, &nNZ) != 3)
-        return MM_PREMATURE_EOF;
-
-    return 0;
-}
-#endif // MatrixMarketReader_H
diff --git a/projects/rocshmem/internal/clients/spts/OpenCLHelper.cpp b/projects/rocshmem/internal/clients/spts/OpenCLHelper.cpp
deleted file mode 100644
index d4d45f269f..0000000000
--- a/projects/rocshmem/internal/clients/spts/OpenCLHelper.cpp
+++ /dev/null
@@ -1,486 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-#include "OpenCLHelper.h"
-#include <cstring>
-#include <string>
-#include <iostream>
-
-// Definitions of CLHelper's static OpenCL handles.  All start out null;
-// CLHelper::Init() assigns them.
-cl_context       CLHelper::context              = nullptr;
-cl_command_queue CLHelper::commandQueue         = nullptr;
-cl_kernel        CLHelper::SpTSKernel           = nullptr;
-cl_kernel        CLHelper::SpTSKernel_analyze   = nullptr;
-cl_kernel        CLHelper::SpTSKernel_levelset  = nullptr;
-cl_kernel        CLHelper::SpTSKernel_scalar    = nullptr;
-cl_kernel        CLHelper::SpTSKernel_vector    = nullptr;
-cl_kernel        CLHelper::SpTSKernel_levelsync = nullptr;
-
-// Expands to a switch case that returns the spelling of the constant itself.
-// The # operator stringifies the argument before macro expansion, so the
-// result is the constant's name, not its numeric value.
-#define SPTS_CL_ERR_CASE(code) case code: return #code;
-
-/*
- * Maps an OpenCL status code to the name of the corresponding constant.
- * Codes introduced by later OpenCL versions or extensions are only handled
- * when the headers in use define the matching feature macro.  Any code that
- * is not recognised yields "UNKNOWN CL ERROR CODE".
- */
-const char * get_cl_err_string(cl_int err)
-{
-    switch (err)
-    {
-        SPTS_CL_ERR_CASE(CL_SUCCESS)
-        SPTS_CL_ERR_CASE(CL_DEVICE_NOT_FOUND)
-        SPTS_CL_ERR_CASE(CL_DEVICE_NOT_AVAILABLE)
-        SPTS_CL_ERR_CASE(CL_COMPILER_NOT_AVAILABLE)
-        SPTS_CL_ERR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE)
-        SPTS_CL_ERR_CASE(CL_OUT_OF_RESOURCES)
-        SPTS_CL_ERR_CASE(CL_OUT_OF_HOST_MEMORY)
-        SPTS_CL_ERR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE)
-        SPTS_CL_ERR_CASE(CL_MEM_COPY_OVERLAP)
-        SPTS_CL_ERR_CASE(CL_IMAGE_FORMAT_MISMATCH)
-        SPTS_CL_ERR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED)
-        SPTS_CL_ERR_CASE(CL_BUILD_PROGRAM_FAILURE)
-        SPTS_CL_ERR_CASE(CL_MAP_FAILURE)
-#ifdef CL_VERSION_1_1
-        SPTS_CL_ERR_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET)
-        SPTS_CL_ERR_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
-#endif
-#ifdef CL_VERSION_1_2
-        SPTS_CL_ERR_CASE(CL_COMPILE_PROGRAM_FAILURE)
-        SPTS_CL_ERR_CASE(CL_LINKER_NOT_AVAILABLE)
-        SPTS_CL_ERR_CASE(CL_LINK_PROGRAM_FAILURE)
-        SPTS_CL_ERR_CASE(CL_DEVICE_PARTITION_FAILED)
-        SPTS_CL_ERR_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)
-#endif
-        SPTS_CL_ERR_CASE(CL_INVALID_VALUE)
-        SPTS_CL_ERR_CASE(CL_INVALID_DEVICE_TYPE)
-        SPTS_CL_ERR_CASE(CL_INVALID_PLATFORM)
-        SPTS_CL_ERR_CASE(CL_INVALID_DEVICE)
-        SPTS_CL_ERR_CASE(CL_INVALID_CONTEXT)
-        SPTS_CL_ERR_CASE(CL_INVALID_QUEUE_PROPERTIES)
-        SPTS_CL_ERR_CASE(CL_INVALID_COMMAND_QUEUE)
-        SPTS_CL_ERR_CASE(CL_INVALID_HOST_PTR)
-        SPTS_CL_ERR_CASE(CL_INVALID_MEM_OBJECT)
-        SPTS_CL_ERR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
-        SPTS_CL_ERR_CASE(CL_INVALID_IMAGE_SIZE)
-        SPTS_CL_ERR_CASE(CL_INVALID_SAMPLER)
-        SPTS_CL_ERR_CASE(CL_INVALID_BINARY)
-        SPTS_CL_ERR_CASE(CL_INVALID_BUILD_OPTIONS)
-        SPTS_CL_ERR_CASE(CL_INVALID_PROGRAM)
-        SPTS_CL_ERR_CASE(CL_INVALID_PROGRAM_EXECUTABLE)
-        SPTS_CL_ERR_CASE(CL_INVALID_KERNEL_NAME)
-        SPTS_CL_ERR_CASE(CL_INVALID_KERNEL_DEFINITION)
-        SPTS_CL_ERR_CASE(CL_INVALID_KERNEL)
-        SPTS_CL_ERR_CASE(CL_INVALID_ARG_INDEX)
-        SPTS_CL_ERR_CASE(CL_INVALID_ARG_VALUE)
-        SPTS_CL_ERR_CASE(CL_INVALID_ARG_SIZE)
-        SPTS_CL_ERR_CASE(CL_INVALID_KERNEL_ARGS)
-        SPTS_CL_ERR_CASE(CL_INVALID_WORK_DIMENSION)
-        SPTS_CL_ERR_CASE(CL_INVALID_WORK_GROUP_SIZE)
-        SPTS_CL_ERR_CASE(CL_INVALID_WORK_ITEM_SIZE)
-        SPTS_CL_ERR_CASE(CL_INVALID_GLOBAL_OFFSET)
-        SPTS_CL_ERR_CASE(CL_INVALID_EVENT_WAIT_LIST)
-        SPTS_CL_ERR_CASE(CL_INVALID_EVENT)
-        SPTS_CL_ERR_CASE(CL_INVALID_OPERATION)
-        SPTS_CL_ERR_CASE(CL_INVALID_GL_OBJECT)
-        SPTS_CL_ERR_CASE(CL_INVALID_BUFFER_SIZE)
-#ifdef CL_VERSION_1_1
-        SPTS_CL_ERR_CASE(CL_INVALID_MIP_LEVEL)
-        SPTS_CL_ERR_CASE(CL_INVALID_GLOBAL_WORK_SIZE)
-        SPTS_CL_ERR_CASE(CL_INVALID_PROPERTY)
-#ifdef cl_ext_device_fission
-        SPTS_CL_ERR_CASE(CL_DEVICE_PARTITION_FAILED_EXT)
-        SPTS_CL_ERR_CASE(CL_INVALID_PARTITION_COUNT_EXT)
-        SPTS_CL_ERR_CASE(CL_INVALID_PARTITION_NAME_EXT)
-#endif
-#endif
-#ifdef CL_VERSION_1_2
-        SPTS_CL_ERR_CASE(CL_INVALID_IMAGE_DESCRIPTOR)
-        SPTS_CL_ERR_CASE(CL_INVALID_COMPILER_OPTIONS)
-        SPTS_CL_ERR_CASE(CL_INVALID_LINKER_OPTIONS)
-        SPTS_CL_ERR_CASE(CL_INVALID_DEVICE_PARTITION_COUNT)
-#endif
-#ifdef CL_VERSION_2_0
-        SPTS_CL_ERR_CASE(CL_INVALID_PIPE_SIZE)
-        SPTS_CL_ERR_CASE(CL_INVALID_DEVICE_QUEUE)
-#endif
-#ifdef CL_VERSION_2_2
-        SPTS_CL_ERR_CASE(CL_INVALID_SPEC_ID)
-        SPTS_CL_ERR_CASE(CL_MAX_SIZE_RESTRICTION_EXCEEDED)
-#endif
-#ifdef cl_khr_icd
-        SPTS_CL_ERR_CASE(CL_PLATFORM_NOT_FOUND_KHR)
-#endif
-        default:
-            return "UNKNOWN CL ERROR CODE";
-    }
-}
-
-#undef SPTS_CL_ERR_CASE
-
-/*
- * Reads the whole of file `fname` into a freshly malloc'ed buffer.
- *
- * On success *source points at the buffer (NUL-terminated for convenience)
- * and *sourceSize holds the number of bytes read, not counting the
- * terminator.  The caller owns the buffer and must free() it.
- *
- * On any failure (file cannot be opened, its size cannot be determined, or
- * memory cannot be allocated) a message is printed to stderr, *source is set
- * to NULL and *sourceSize to 0.
- */
-void convertToStr(char **source, size_t* sourceSize, const std::string fname)
-{
-    *source = NULL;
-    *sourceSize = 0;
-
-    FILE *fp = fopen(fname.c_str(), "r");
-    if (fp == NULL)
-    {
-        fprintf(stderr, "Error: cannot open file %s\n", fname.c_str());
-        return;
-    }
-
-    long file_len = -1;
-    if (fseek(fp, 0, SEEK_END) == 0)
-        file_len = ftell(fp);
-    if (file_len < 0 || fseek(fp, 0, SEEK_SET) != 0)
-    {
-        fprintf(stderr, "Error: cannot determine size of file %s\n", fname.c_str());
-        fclose(fp);
-        return;
-    }
-
-    char *buffer = (char *)malloc((size_t)file_len + 1);
-    if (buffer == NULL)
-    {
-        fprintf(stderr, "Error: out of memory reading file %s\n", fname.c_str());
-        fclose(fp);
-        return;
-    }
-
-    // Report the byte count actually read; it can be smaller than file_len
-    // (for example after newline translation in text mode).
-    const size_t bytes_read = fread(buffer, 1, (size_t)file_len, fp);
-    fclose(fp);
-    buffer[bytes_read] = '\0';
-
-    *source = buffer;
-    *sourceSize = bytes_read;
-}
-
-/*
- * Sets up all OpenCL state used by the solver:
- *   1. selects a platform (the AMD one if present, otherwise the last one
- *      enumerated),
- *   2. creates a GPU context and queries its devices,
- *   3. creates a profiling-enabled command queue on the device chosen by the
- *      "device" input flag,
- *   4. loads the source in `filename` and builds it twice -- once as is and
- *      once with -DSYNCFREE_KERNEL,
- *   5. creates the six kernel objects.
- *
- * Returns 0 on success and 1 on any failure, after printing a diagnostic.
- */
-int CLHelper::Init(const std::string &filename, InputFlags &in_flags)
-{
-    cl_int status = 0;
-
-    /*
-     * Have a look at the available platforms and pick either
-     * the AMD one if available or a reasonable default.
-     */
-    cl_uint numPlatforms = 0;
-    platform = NULL;
-    status = clGetPlatformIDs(0, NULL, &numPlatforms);
-    if (status != CL_SUCCESS)
-    {
-        fprintf(stderr, "clGetPlatformIDs failed: %s\n", get_cl_err_string(status));
-        return 1;
-    }
-    if (numPlatforms > 0)
-    {
-        cl_platform_id *platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
-        if (platforms == NULL)
-        {
-            fprintf(stderr, "Error: out of memory allocating the platform list\n");
-            return 1;
-        }
-        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
-        if (status != CL_SUCCESS)
-        {
-            fprintf(stderr, "clGetPlatformIDs failed: %s\n", get_cl_err_string(status));
-            free(platforms);
-            return 1;
-        }
-        for (cl_uint i = 0; i < numPlatforms; ++i)
-        {
-            char pbuf[100];
-            status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL);
-            if (status != CL_SUCCESS)
-            {
-                fprintf(stderr, "clGetPlatformInfo failed: %s\n", get_cl_err_string(status));
-                free(platforms);
-                return 1;
-            }
-
-            // Remember every platform seen so that the last one is the
-            // fallback when no AMD platform exists.
-            platform = platforms[i];
-            if (!strcmp(pbuf, "Advanced Micro Devices, Inc."))
-                break;
-        }
-        free(platforms);
-    }
-
-    /////////////////////////////////////////////////////////////////
-    // Create an OpenCL context
-    /////////////////////////////////////////////////////////////////
-    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
-    cl_context_properties *cprops = (NULL == platform) ? NULL : cps;
-    context = clCreateContextFromType(cprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
-    if (status != CL_SUCCESS)
-    {
-        printf("status: %d", status);
-        fprintf(stderr, "Error: Creating Context. (clCreateContextFromType): %s\n", get_cl_err_string(status));
-        return 1;
-    }
-
-    /////////////////////////////////////////////////////////////////
-    // Detect OpenCL devices
-    /////////////////////////////////////////////////////////////////
-    // CL_CONTEXT_NUM_DEVICES yields a cl_uint.  Reading it into a wider,
-    // uninitialised size_t would leave the upper bytes undefined.
-    cl_uint numDevices = 0;
-    status = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(numDevices), &numDevices, NULL);
-    if (status != CL_SUCCESS)
-    {
-        fprintf(stderr, "Error: Getting Context Info (device list size, clGetContextInfo): %s\n", get_cl_err_string(status));
-        return 1;
-    }
-    if (numDevices == 0)
-    {
-        fprintf(stderr, "Error: No devices found in the OpenCL context\n");
-        return 1;
-    }
-
-    devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
-    if (devices == NULL)
-    {
-        fprintf(stderr, "Error: out of memory allocating the device list\n");
-        return 1;
-    }
-
-    /* Now, get the device list data */
-    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, numDevices * sizeof(cl_device_id), devices, NULL);
-    if (status != CL_SUCCESS)
-    {
-        fprintf(stderr, "Error: Getting Context Info (device list, clGetContextInfo): %s\n", get_cl_err_string(status));
-        return 1;
-    }
-
-    const int deviceNum = in_flags.GetValueInt("device");
-    if (deviceNum < 0 || (cl_uint)deviceNum >= numDevices)
-    {
-        fprintf(stderr, "Error: device %d requested but only %u device(s) available\n", deviceNum, numDevices);
-        return 1;
-    }
-
-    // Two-step query: first the length of the name, then the name itself.
-    // The second call must be given the real buffer size, not sizeof(pointer).
-    size_t dev_name_size = 0;
-    status = clGetDeviceInfo(devices[deviceNum], CL_DEVICE_NAME, 0, NULL, &dev_name_size);
-    if (status != CL_SUCCESS)
-    {
-        fprintf(stderr, "Error: Getting device name size (clGetDeviceInfo): %s\n", get_cl_err_string(status));
-        return 1;
-    }
-    std::string deviceName(dev_name_size, '\0');
-    if (dev_name_size > 0)
-    {
-        status = clGetDeviceInfo(devices[deviceNum], CL_DEVICE_NAME, dev_name_size, &deviceName[0], NULL);
-        if (status != CL_SUCCESS)
-        {
-            fprintf(stderr, "Error: Getting device name (clGetDeviceInfo): %s\n", get_cl_err_string(status));
-            return 1;
-        }
-    }
-    printf("Device Name: %s\n", deviceName.c_str());
-
-    const bool use_gcn3 = strstr(deviceName.c_str(), "gfx8") != NULL;
-    const bool use_gcn2 = strstr(deviceName.c_str(), "gfx7") != NULL;
-
-    /////////////////////////////////////////////////////////////////
-    // Create an OpenCL command queue
-    /////////////////////////////////////////////////////////////////
-    commandQueue = clCreateCommandQueue(context, devices[deviceNum], CL_QUEUE_PROFILING_ENABLE, &status);
-    if (status != CL_SUCCESS)
-    {
-        fprintf(stderr, "Creating Command Queue. (clCreateCommandQueue): %s\n", get_cl_err_string(status));
-        return 1;
-    }
-
-    /////////////////////////////////////////////////////////////////
-    // Load CL file, build CL program object, create CL kernel object
-    /////////////////////////////////////////////////////////////////
-    char *source = NULL;
-    size_t sourceSize = 0;
-    convertToStr(&source, &sourceSize, filename);
-    if (source == NULL)
-    {
-        fprintf(stderr, "Error: Could not load OpenCL source from %s\n", filename.c_str());
-        return 1;
-    }
-
-    syncfree_program = clCreateProgramWithSource(context, 1, (const char **)&source, &sourceSize, &status);
-    if (status != CL_SUCCESS)
-    {
-        fprintf(stderr, "Error: Loading source into cl_program (clCreateProgramWithSource): %s\n", get_cl_err_string(status));
-        free(source);
-        return 1;
-    }
-    analyze_levelset_program = clCreateProgramWithSource(context, 1, (const char **)&source, &sourceSize, &status);
-    // Both program objects hold their own copy of the text now.
-    free(source);
-    if (status != CL_SUCCESS)
-    {
-        fprintf(stderr, "Error: Loading source into cl_program (clCreateProgramWithSource): %s\n", get_cl_err_string(status));
-        return 1;
-    }
-
-    std::string buildFlags = "-x clc++ -Dcl_khr_int64_base_atomics=1 -cl-std=CL2.0";
-    if (use_gcn3)
-        buildFlags += " -DGCN3 ";
-    if (use_gcn2)
-        buildFlags += " -DGCN2 ";
-    buildFlags += " -DROW_BITS=" + std::to_string(ROW_BITS);
-    buildFlags += " -DWG_BITS=" + std::to_string(WG_BITS);
-    buildFlags += " -DWF_SIZE=" + std::to_string(WF_SIZE);
-    buildFlags += " -DWF_PER_WG=" + std::to_string(WF_PER_WG);
-#ifdef USE_DOUBLE
-    buildFlags += " -DDOUBLE";
-#endif
-
-    // Builds one program for the selected device; on failure prints the
-    // status and the compiler's build log and reports false.
-    auto buildProgram = [&](cl_program program, const std::string &flags, const char *label) -> bool
-    {
-        const cl_int build_status = clBuildProgram(program, 1, &devices[deviceNum], flags.c_str(), NULL, NULL);
-        if (build_status == CL_SUCCESS)
-            return true;
-
-        printf("Error: Building %s Program (clBuildProgram): %d\n", label, build_status);
-        size_t log_size = 0;
-        clGetProgramBuildInfo(program, devices[deviceNum], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-        std::string log(log_size, '\0');
-        if (log_size > 0)
-            clGetProgramBuildInfo(program, devices[deviceNum], CL_PROGRAM_BUILD_LOG, log_size, &log[0], NULL);
-        printf("%s ", log.c_str());
-        return false;
-    };
-
-    if (!buildProgram(analyze_levelset_program, buildFlags, "Analyze and Levelset"))
-        return 1;
-
-    buildFlags += " -DSYNCFREE_KERNEL";
-    if (!buildProgram(syncfree_program, buildFlags, "Syncfree"))
-        return 1;
-
-    // Kernel objects to create: destination handle, owning program, entry
-    // point name in the CL source, and the label used in error messages.
-    struct KernelSpec
-    {
-        cl_kernel *handle;
-        cl_program program;
-        const char *entry;
-        const char *label;
-    };
-    const KernelSpec kernels[] = {
-        { &SpTSKernel,           syncfree_program,         "amd_spts_syncfree_solve",    "SpTS" },
-        { &SpTSKernel_analyze,   analyze_levelset_program, "amd_spts_analyze_and_solve", "SpTS_analyze" },
-        { &SpTSKernel_levelset,  analyze_levelset_program, "amd_spts_levelset_solve",    "SpTS_levelset" },
-        { &SpTSKernel_scalar,    analyze_levelset_program, "amd_spts_scalar_solve",      "SpTS_scalar" },
-        { &SpTSKernel_vector,    analyze_levelset_program, "amd_spts_vector_solve",      "SpTS_vector" },
-        { &SpTSKernel_levelsync, analyze_levelset_program, "amd_spts_levelsync_solve",   "SpTS_levelsync" },
-    };
-    for (size_t k = 0; k < sizeof(kernels) / sizeof(kernels[0]); ++k)
-    {
-        *kernels[k].handle = clCreateKernel(kernels[k].program, kernels[k].entry, &status);
-        if (status != CL_SUCCESS)
-        {
-            fprintf(stderr, "Error: Creating Kernel from program. (%s): %s\n", kernels[k].label, get_cl_err_string(status));
-            return 1;
-        }
-    }
-
-    // All good
-    return 0;
-}
-
-/*
- * Does nothing when `status` is CL_SUCCESS.  Otherwise writes `errString`
- * followed by the name of the status code to std::cerr and terminates the
- * process with exit(-1).
- */
-void CLHelper::checkStatus(cl_int status, const std::string errString)
-{
-    if (status == CL_SUCCESS)
-        return;
-
-    std::cerr << errString << " : " << get_cl_err_string(status) << std::endl;
-    exit(-1);
-}
-
-/*
- * Creates an OpenCL buffer of `size` bytes in the shared context, passing
- * `flags` and `hostBuffer` straight through to clCreateBuffer().  `name` is
- * only used to build the error message.  Failure is handled by checkStatus(),
- * which terminates the process.
- */
-memPointer CLHelper::AllocateMem(const std::string name,
-                                 size_t size,
-                                 memPointer_flags flags,
-                                 void *hostBuffer)
-{
-    cl_int status;
-    cl_mem buffer = clCreateBuffer(context, flags, size, hostBuffer, &status);
-
-    checkStatus(status, "OpenCL error allocating " + name + " !");
-    return buffer;
-}
-
-/*
- * Enqueues a write of `size` bytes from `hostBuffer` into `devBuffer` at byte
- * `offset` on the shared command queue.  `blocking` and `ev` are forwarded to
- * clEnqueueWriteBuffer() unchanged.  Failure is handled by checkStatus(),
- * which terminates the process.
- */
-void CLHelper::CopyToDevice(memPointer devBuffer,
-                            void *hostBuffer,
-                            size_t size,
-                            size_t offset,
-                            cl_bool blocking,
-                            cl_event *ev)
-{
-    const cl_int status = clEnqueueWriteBuffer(commandQueue, devBuffer, blocking,
-                                               offset, size, hostBuffer,
-                                               0, NULL, ev);
-    checkStatus(status, "OpenCL error copying data to device !");
-}
-
-/*
- * Enqueues a read of `size` bytes from `devBuffer`, starting at byte
- * `offset`, into `hostBuffer` on the shared command queue.  `blocking` and
- * `ev` are forwarded to clEnqueueReadBuffer() unchanged.  Failure is handled
- * by checkStatus(), which terminates the process.
- */
-void CLHelper::CopyToHost(memPointer devBuffer,
-                                void *hostBuffer,
-                                size_t size,
-                                size_t offset,
-                                cl_bool blocking,
-                                cl_event *ev)
-{
-    cl_int status;
-    status = clEnqueueReadBuffer(commandQueue, devBuffer, blocking, offset, size, hostBuffer, 0, NULL, ev);
-
-    // The message used to say "to device", a copy-paste from CopyToDevice().
-    checkStatus(status, "OpenCL error copying data to host !");
-}
-
-/*
- * Returns the difference between the CL_PROFILING_COMMAND_END and
- * CL_PROFILING_COMMAND_START timestamps of `event`.
- *
- * If either timestamp cannot be queried, a warning is printed to stderr and
- * 0 is returned instead of a difference of uninitialised values.
- */
-int64_t CLHelper::ComputeTime(cl_event event)
-{
-    // The profiling counters are cl_ulong; query them with their own type.
-    cl_ulong start_time = 0;
-    cl_ulong end_time = 0;
-
-    cl_int status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
-                                            sizeof(start_time), &start_time, NULL);
-    if (status == CL_SUCCESS)
-        status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
-                                         sizeof(end_time), &end_time, NULL);
-
-    if (status != CL_SUCCESS)
-    {
-        fprintf(stderr, "Warning: clGetEventProfilingInfo failed: %s\n", get_cl_err_string(status));
-        return 0;
-    }
-
-    return (int64_t)(end_time - start_time);
-}
diff --git a/projects/rocshmem/internal/clients/spts/OpenCLHelper.h b/projects/rocshmem/internal/clients/spts/OpenCLHelper.h
deleted file mode 100644
index 49a8b83646..0000000000
--- a/projects/rocshmem/internal/clients/spts/OpenCLHelper.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-#ifndef CLHelper_H
-#define CLHelper_H
-
-#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-
-#include <CL/cl.h>
-#include <string>
-#include <iostream>
-#include <sstream>
-#include "InputFlags.h"
-#include "GPUHelper.h"
-
-struct LocalMemArg 
-{
-	LocalMemArg(size_t _size) : size(_size) {}
-	size_t GetSize() const { return size; }
-
-	private:
-	size_t size;
-};
-
-class CLHelper : public GPUHelper
-{
-	cl_platform_id platform;
-	cl_device_id *devices;
-	cl_program syncfree_program;
-    cl_program analyze_levelset_program;
-
-	public:
-	static cl_context context;
-	static cl_kernel SpTSKernel;
-    static cl_kernel SpTSKernel_analyze;
-    static cl_kernel SpTSKernel_levelset;
-    static cl_kernel SpTSKernel_scalar;
-    static cl_kernel SpTSKernel_vector;
-    static cl_kernel SpTSKernel_levelsync;
-	static cl_command_queue commandQueue;
-
-	CLHelper() {}
-	int Init(const std::string &_filename, InputFlags &in_flags);
-	void checkStatus(gpuError status, const std::string errString);
-	void CopyToDevice(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, cl_bool _blocking, cl_event *_ev);
-	void CopyToHost(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, cl_bool _blocking, cl_event *_ev);
-	memPointer AllocateMem(const std::string name, size_t, memPointer_flags flags, void *);
-	void FreeMem(memPointer ptr) { clReleaseMemObject(ptr); }
-    void Flush() { clFinish(commandQueue); }
-
-	template<typename T, typename... Args>
-	void SetArgs(cl_kernel, int i, const T& first, const Args&... rest);
-	template<typename... Args>
-	void SetArgs(cl_kernel, int i, const LocalMemArg &lmem, const Args&... rest);
-	void SetArgs(cl_kernel, int i) {}
-
-	int64_t ComputeTime(cl_event event);
-
-};
-
-template<typename T, typename... Args>
-void CLHelper::SetArgs(cl_kernel kernel, int i, const T& first, const Args&... rest)
-{
-	cl_int status;
-
-	status = clSetKernelArg(kernel, i++, sizeof(T), (void *)& first);
-	std::stringstream errStream;
-	errStream<<"OpenCL error setting kernel argument "<<i;
-	checkStatus(status, errStream.str()) ;
-
-	SetArgs(kernel, i, rest...);
-}
-
-template<typename... Args>
-void CLHelper::SetArgs(cl_kernel kernel, int i, const LocalMemArg &lmem, const Args&... rest)
-{
-	cl_int status;
-	status = clSetKernelArg(kernel, i++, lmem.GetSize(), NULL);
-	std::stringstream errStream;
-	errStream<<"OpenCL error setting kernel argument (local memory) "<<i;
-	checkStatus(status, errStream.str()) ;
-
-	SetArgs(kernel, i, rest...);
-
-}
-
-#endif //CLHelper_H
-
diff --git a/projects/rocshmem/internal/clients/spts/SpTS.h b/projects/rocshmem/internal/clients/spts/SpTS.h
deleted file mode 100644
index ccfaaed27f..0000000000
--- a/projects/rocshmem/internal/clients/spts/SpTS.h
+++ /dev/null
@@ -1,1599 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-#ifndef SpTS_H
-#define SpTS_H
-#define TEST_NUM 9999999999ULL
-
-#include "InputFlags.h"
-#include "SparseMatrix.h"
-
-#include "MatrixMarketReader.h"
-#include <vector>
-#include <float.h>
-#include <unordered_map>
-#include <cmath>
-#include <chrono>
-#include <thread>
-
-#include <time.h>
-#include <sys/time.h>
-
-#include <unistd.h>
-
-
-#ifdef USE_ROCSHMEM
-#include "rocshmem.hpp"
-#include "mpi.h"
-#endif
-
-#ifdef USE_HIP
-#include "spts_kernel.h"
-#endif
-
-#ifdef DBL_DECIMAL_DIG
-    #define OP_DBL_Digs (DBL_DECIMAL_DIG)
-#else
-    #ifdef DECIMAL_DIG
-        #define OP_DBL_Digs (DECIMAL_DIG)
-    #else
-        #define OP_DBL_Digs (DBL_DIG + 3)
-    #endif
-#endif
-
-#ifdef FLT_DECIMAL_DIG
-  #define OP_FLT_Digs (FLT_DECIMAL_DIG)
-#else
-  #ifdef DECIMAL_DIG
-    #define OP_FLT_Digs (DECIMAL_DIG)
-  #else
-    #define OP_FLT_Digs (FLT_DIG + 3)
-  #endif
-#endif
-
-using namespace rocshmem;
-
-template<typename FloatType>
-class SparseTriangularSolve :
-    public InputFlags, public SparseMatrix<FloatType>
-{
-    FloatType *x;
-    FloatType *y;
-    FloatType *y_zero;
-    FloatType *yref;
-    std::vector<uint64_t> rowBlocks;
-
-    memPointer xDev;
-    memPointer yDev;
-    memPointer completedRowsDev;
-    memPointer rowBlocksDev;
-    memPointer doneArrayDev;
-    memPointer shadowDoneArrayDev;
-    memPointer remoteInProgressArrayDev;
-    memPointer reqUpdateArrayDev;
-    memPointer numRowsAtLevelDev;
-    memPointer maxDepthDev;
-    memPointer rowMapDev;
-    memPointer totalSpinDev;
-    memPointer oneBufDev;
-
-    int nNZ;
-    int nRows;
-    int nCols;
-    int numBlocks;
-/*
-    #ifdef USE_ROCSHMEM
-    rocshmem_t* handle;
-    #endif
-*/
-    std::unordered_map<int, FloatType> *observed_errors;
-    int *errors_seen;
-
-    public:
-
-    SparseTriangularSolve() : nNZ(0), nRows(0), nCols(0), numBlocks(0)
-    {
-        x = NULL; y = NULL; y_zero = NULL, yref = NULL, observed_errors = NULL, errors_seen = NULL;
-        xDev = yDev = completedRowsDev = remoteInProgressArrayDev = rowBlocksDev =  doneArrayDev = shadowDoneArrayDev = numRowsAtLevelDev = maxDepthDev = rowMapDev = totalSpinDev = oneBufDev = 0;
-
-        #ifdef USE_ROCSHMEM
-	int rocshmem_queues = (2560 / WF_PER_WG);
-       	if (2560 % WF_PER_WG)
-            rocshmem_queues++;
-	printf("rocshmem_queues %d WF_PER_WG %d  \n",rocshmem_queues, WF_PER_WG);
-        rocshmem_init(rocshmem_queues);
-
-        this->Set_total_pes(rocshmem_n_pes());
-        this->Set_this_pe(rocshmem_my_pe());
-        #else
-        this->Set_total_pes(1);
-        this->Set_this_pe(0);
-        #endif
-    }
-
-    void AddDerivedInputFlags();
-    void AllocateVectors(MatrixMarketReader<FloatType> &mm_reader);
-    void CSRSpTSCPU(FloatType alpha);
-    bool CSRCheckCPU(FloatType alpha);
-
-    float CSRSpTSGPU(uint64_t &ns_per_iter, uint64_t &ns_per_analysis_iter, uint64_t &ns_per_syncfree_iter, uint64_t &ns_per_levelset_iter, uint64_t &ns_per_levelsync_iter, FloatType alpha);
-
-    int VerifyResults(int);
-    int NonDeterministicErrors();
-    int MaxErrors();
-    int ComputeRowBlocks(std::vector<uint64_t> &, int *, int);
-
-    ~SparseTriangularSolve()
-    {
-        if (x != NULL)
-            delete[] x;
-        if (y != NULL)
-            delete[] y;
-        if (y_zero != NULL)
-            delete[] y_zero;
-        if (yref != NULL)
-            delete[] yref;
-        if (errors_seen != NULL)
-            delete[] errors_seen;
-
-        if (xDev != 0)
-            this->GPU->FreeMem(xDev);
-        if (rowBlocksDev != 0)
-            this->GPU->FreeMem(rowBlocksDev);
-        if (completedRowsDev != 0)
-            this->GPU->FreeMem(completedRowsDev);
-        if (numRowsAtLevelDev != 0)
-            this->GPU->FreeMem(numRowsAtLevelDev);
-        if (maxDepthDev != 0)
-            this->GPU->FreeMem(maxDepthDev);
-        if (rowMapDev != 0)
-            this->GPU->FreeMem(rowMapDev);
-        if (totalSpinDev != 0)
-            this->GPU->FreeMem(totalSpinDev);
-        if (oneBufDev != 0)
-            this->GPU->FreeMem(oneBufDev);
-        if (remoteInProgressArrayDev != 0)
-            this->GPU->FreeMem(remoteInProgressArrayDev);
-
-        #ifndef USE_ROCSHMEM
-        if (yDev != 0)
-            this->GPU->FreeMem(yDev);
-        if (doneArrayDev != 0)
-            this->GPU->FreeMem(doneArrayDev);
-        if (reqUpdateArrayDev != 0)
-            this->GPU->FreeMem(reqUpdateArrayDev);
-        if (shadowDoneArrayDev != 0)
-            this->GPU->FreeMem(shadowDoneArrayDev);
-        #else
-        if (yDev != 0)
-            rocshmem_free(yDev);
-        if (doneArrayDev != 0)
-            rocshmem_free(doneArrayDev);
-        if (reqUpdateArrayDev != 0)
-            rocshmem_free(reqUpdateArrayDev);
-        if (shadowDoneArrayDev != 0)
-            rocshmem_free(shadowDoneArrayDev);
-        rocshmem_finalize();
-        #endif
-    }
-};
-
-    template<typename FloatType>
-void SparseTriangularSolve<FloatType>::AddDerivedInputFlags()
-{
-    AddInputFlag("filename", 'f', "", "Matrix-Market File", "string");
-    AddInputFlag("iterations", 'i', "10", "Number of SpTS Iterations (Default=10)", "int");
-    AddInputFlag("exp_zeroes", 'z', "false", "Include Explicit Zeroes in Matrix-Market File (Default=false)", "bool");
-    AddInputFlag("device", 'd', "0", "Choose the GPU to Execute SpTS (Default=0)", "int");
-    AddInputFlag("alpha", 'A', "1.0", "A*y=alpha*x. Known vector 'x' is multiplied by scalar alpha befoer solving for vector 'y'. (Default=1.0)", "float");
-    AddInputFlag("non_symmetric", 'n', "false", "Force the program to work on non-symmetric matrices. This will ignore the upper triangular entirely. (Default=false)", "bool");
-    AddInputFlag("levelsync_size", 'l', "0", "Number of rows to launch in a level-sync kernel invocation (Default = auto-tune)", "int");
-    AddInputFlag("verify", 'v', "false", "Verify results", "bool");
-    AddInputFlag("rocshmem_algorithm", 'a', "0", "rocSHMEM algorithm type", "int");
-    AddInputFlag("block_size", 'b', "32768", "Use get-based algorithm for rocSHMEM", "int");
-	AddInputFlag("put_block_size", 'p', "1024", "Block size for puts", "int");
-	AddInputFlag("get_backoff_factor", 'g', "128", "Backoff factor for gets", "int");
-}
-
-    template<typename FloatType>
-void SparseTriangularSolve<FloatType>::AllocateVectors(
-        MatrixMarketReader<FloatType> &mm_reader)
-{
-    nRows = mm_reader.GetNumRows();
-    nCols = mm_reader.GetNumCols();
-    nNZ = mm_reader.GetNumNonZeroes();
-
-    x = new FloatType[nCols];
-    y = new FloatType[nRows];
-    y_zero = new FloatType[nRows];
-    yref = new FloatType[nRows];
-    observed_errors = new std::unordered_map<int, FloatType>[InputFlags::GetValueInt("iterations")];
-
-    for(int i = 0; i < nRows; i++)
-    {
-        y[i] = (FloatType)0.0;
-        y_zero[i] = (FloatType)0.0;
-        yref[i] = (FloatType)0.0;
-    }
-
-    for(int i = 0; i < nCols; i++)
-    {
-        //x[i] = (FloatType)rand() / (FloatType)RAND_MAX;
-        x[i] = 2.;
-    }
-
-    xDev = this->GPU->AllocateMem("xDev", nCols*sizeof(FloatType), GPU_MEM_READ_ONLY, NULL);
-    #ifndef USE_ROCSHMEM
-    yDev = this->GPU->AllocateMem("yDev", nRows*sizeof(FloatType), GPU_MEM_READ_WRITE, NULL);
-    #else
-    yDev = (memPointer) rocshmem_malloc(nRows*sizeof(FloatType));
-    #endif
-}
-
-    template<typename FloatType>
-void SparseTriangularSolve<FloatType>::CSRSpTSCPU(FloatType alpha)
-{
-    FloatType *NZvalues = SparseMatrix<FloatType>::GetVals();
-    int *Cols = SparseMatrix<FloatType>::GetCols();
-    int *rowptrs = SparseMatrix<FloatType>::GetRowPtrs();
-    double internal_alpha = alpha;
-
-    uint64_t local_nnz = 0;
-    uint64_t remote_nnz = 0;
-    uint64_t rows_with_nonlocal = 0;
-
-    for(int i = 0; i < nRows; i++)
-    {
-        bool row_has_nonlocal = false;
-        double diagonal = 0.;
-        double temp = 0.;
-        int diag_j = -1;
-        for(int j = rowptrs[i]; j < rowptrs[i+1]; j++)
-        {
-            int ci = Cols[j];
-            int row_pe = (i / SPTS_BLOCK_SIZE) % this->Get_total_pes();
-            int col_pe = (ci / SPTS_BLOCK_SIZE) % this->Get_total_pes();
-
-            int assigned_pe = (i / SPTS_BLOCK_SIZE) % this->Get_total_pes();
-            if (assigned_pe == this->Get_this_pe()) {
-                if (row_pe == col_pe) {
-                    local_nnz++;
-                } else {
-                    row_has_nonlocal = true;
-                    remote_nnz++;
-                }
-            }
-
-            // Skip adding in the diagonal. We need to solve for that.
-            if (ci != i)
-            {
-                if (i == TEST_NUM)
-                    fprintf(stderr, "NZvalues[%d](%lf) * yref[%d](%lf)\n", j, NZvalues[j], ci, yref[ci]);
-                temp += NZvalues[j] * yref[ci];
-            }
-            else
-            {
-                if (i==TEST_NUM)
-                    fprintf(stderr, "\t\tDIAG = %lf\n", NZvalues[j]);
-                diagonal = NZvalues[j];
-                diag_j = j;
-            }
-        }
-        if (row_has_nonlocal) rows_with_nonlocal++;
-        if (diag_j == -1)
-        {
-            fflush(stdout);
-            printf("\nERROR in SpTS CPU\n");
-            printf("No diagonal found in row %d\n", i);
-        }
-        // y = (x-sum_of_vals_from_A) / diag
-        double alpha_x = internal_alpha * (double)x[i];
-        if (i == TEST_NUM)
-        {
-            char buf[128];
-            char buf2[128];
-            char buf3[128];
-            snprintf(buf, sizeof(buf), "%.20f", alpha_x);
-            snprintf(buf2, sizeof(buf2), "%.20f", internal_alpha);
-            fprintf(stderr, "alpha_x: %s (%s * %lf)\n", buf, buf2, x[i]);
-            snprintf(buf3, sizeof(buf3), "%.20f", temp);
-            fprintf(stderr, "temp: %s\n", buf3);
-        }
-        yref[i] = (FloatType)((alpha_x - temp)/diagonal);
-        if (i == TEST_NUM)
-            fprintf(stderr, "\tsupposed answer [%d]: %lf\n", i, yref[i]);
-    }
-    double ratio = ((double) local_nnz) / ((double) remote_nnz + local_nnz);
-    double rows_remote_ratio = ((double) rows_with_nonlocal) / ((double) this->nRows_p);
-    if (this->Get_this_pe() == 0) {
-        printf("\nRANK 0: global NNZ = %lu\n", remote_nnz + local_nnz);
-        printf("RANK 0: global Rows = %d\n", nRows);
-    }
-    printf("\nLOCALITY  %d : Remote/Local cols %lu/%lu Fraction Columns Local %f Fraction Rows with Remote Columns %f\n", this->Get_this_pe(), remote_nnz, local_nnz, ratio, rows_remote_ratio);
-}
-
-    template<typename FloatType>
-bool SparseTriangularSolve<FloatType>::CSRCheckCPU(FloatType alpha)
-{
-    FloatType *NZvalues = SparseMatrix<FloatType>::GetVals();
-    int *Cols = SparseMatrix<FloatType>::GetCols();
-    int *rowptrs = SparseMatrix<FloatType>::GetRowPtrs();
-    double internal_alpha = alpha;
-    bool all_worked = true;
-
-#pragma omp parallel for
-    for(int i = 0; i < nRows; i++)
-    {
-#pragma omp flush (all_worked)
-        if (all_worked)
-        {
-            double temp = 0.;
-            for(int j = rowptrs[i]; j < rowptrs[i+1]; j++)
-            {
-                int ci = Cols[j];
-                // Skip anything that lies on the diagonal. We need to solve for that.
-                temp += NZvalues[j] * yref[ci];
-            }
-            double compare_val = 0.;
-            double alpha_x = internal_alpha * x[i];
-            if(typeid(FloatType) == typeid(float))
-            {
-                compare_val = fabs(alpha_x*1e-3);
-                if (compare_val < 10*FLT_EPSILON)
-                    compare_val = 10*FLT_EPSILON;
-                if ((FloatType)(alpha_x - compare_val) > (FloatType)temp || (FloatType)(alpha_x + compare_val) < (FloatType)temp)
-                {
-                    fflush(stdout);
-                    fprintf(stderr, " CPU CALCULATION ERROR on row %d\n", i);
-                    fprintf(stderr, "\tReal value for row %d: %.*e\n", i, OP_FLT_Digs-1, (float)alpha_x);
-                    fprintf(stderr, "\tCalculated value for row %d: %.*e\n", i, OP_FLT_Digs-1, (float)temp);
-                    all_worked = false;
-#pragma omp flush (all_worked)
-                }
-            }
-            else if(typeid(FloatType) == typeid(double))
-            {
-                compare_val = fabs(alpha_x*1e-4);
-                if (compare_val < 10*DBL_EPSILON)
-                    compare_val = 10*DBL_EPSILON;
-                if ((FloatType)(alpha_x - compare_val) > (FloatType)temp || (FloatType)(alpha_x + compare_val) < (FloatType)temp)
-                {
-                    fflush(stdout);
-                    fprintf(stderr, " CPU CALCULATION ERROR on row %d\n", i);
-                    fprintf(stderr, "\tReal value for row %d: %.*le\n", i, OP_DBL_Digs-1, (double)alpha_x);
-                    fprintf(stderr, "\tCalculated value for row %d: %.*le\n", i, OP_DBL_Digs-1, (double)temp);
-                    all_worked = false;
-#pragma omp flush (all_worked)
-                }
-            }
-        }
-    }
-    return all_worked;
-}
-
-    template<>
-int SparseTriangularSolve<float>::VerifyResults(int iteration)
-{
-    int errors = 0;
-
-    #pragma omp parallel for
-    for (int i = 0; i < nRows; i++)
-    {
-        int assigned_pe = (i / SPTS_BLOCK_SIZE) % this->Get_total_pes();
-        if (this->Get_this_pe() == assigned_pe) {
-            float compare_val = fabs(yref[i]*1e-3);
-            if (compare_val < 10*FLT_EPSILON)
-                compare_val = 10*FLT_EPSILON;
-            if ((yref[i] - compare_val) > y[i] || (yref[i] + compare_val) < y[i])
-            {
-                #pragma omp critical
-                {
-                    if(errors == 0)
-                    {
-                        fflush(stdout);
-                        fprintf(stderr, "\nDetected some differences between CPU and GPU results on iteration %d...", iteration);
-                    }
-                    fprintf(stderr, "%d GPU CALCULATION ERROR on row %d\n", this->Get_this_pe(), i);
-                    fprintf(stderr, "\tCPU value for y[%d]: %.*e\n", i, OP_FLT_Digs-1, yref[i]);
-                    fprintf(stderr, "\tGPU value for y[%d]: %.*e\n", i, OP_FLT_Digs-1, y[i]);
-                    errors += 1;
-                    observed_errors[iteration].insert(std::pair<int, float> (i, y[i]));
-                }
-            }
-        }
-    }
-    return errors;
-}
-
-    template<>
-int SparseTriangularSolve<double>::VerifyResults(int iteration)
-{
-    int errors = 0;
-    #pragma omp parallel for
-    for (int i = 0; i < nRows; i++)
-    {
-        double compare_val = fabs(yref[i]*1e-4);
-        if (compare_val < 10*DBL_EPSILON)
-            compare_val = 10*DBL_EPSILON;
-        if ((yref[i] - compare_val) > y[i] || (yref[i] + compare_val) < y[i])
-        {
-            #pragma omp critical
-            {
-                if(errors == 0)
-                {
-                    fflush(stdout);
-                    fprintf(stderr, "\nDetected differences between CPU and GPU results on iteration %d...", iteration);
-                }
-                fprintf(stderr, "GPU CALCULATION ERROR on row %d\n", i);
-                fprintf(stderr, "\tCPU value for y[%d]: %.*e\n", i, OP_DBL_Digs-1, yref[i]);
-                fprintf(stderr, "\tGPU value for y[%d]: %.*e\n", i, OP_DBL_Digs-1, y[i]);
-                errors += 1;
-                observed_errors[iteration].insert(std::pair<int, double> (i, y[i]));
-            }
-        }
-    }
-    return errors;
-}
-
-template<typename FloatType>
-int SparseTriangularSolve<FloatType>::NonDeterministicErrors()
-{
-    int iter = InputFlags::GetValueInt("iterations");
-    int non_det_errors = 0;
-#ifdef ALL_SYNCFREE
-    for (int i = 1; i < iter; i++)
-    {
-        if (errors_seen[i] != errors_seen[0])
-        {
-            non_det_errors++;
-            if (non_det_errors == 1)
-            {
-                fprintf(stderr, "Different SpTS iterations saw different error counts -- non-deterministic bug possible.\n");
-                fprintf(stderr, "\te.g. saw %d errors during iteration 0. Saw %d errors during iteration %i\n", errors_seen[0], errors_seen[i], i);
-            }
-        }
-        else if (observed_errors[i] != observed_errors[0])
-        {
-            non_det_errors++;
-            if (non_det_errors == 1)
-            {
-                fprintf(stderr, "ERRORS were seen. Different iterations saw errors on different rows -- non-deterministic bug possible.\n");
-                fprintf(stderr, "\te.g. Iterations 0 and %d were different.\n", i);
-            }
-        }
-    }
-#else
-    if (iter >= 1)
-    {
-        if (errors_seen[0] != errors_seen[1])
-        {
-            non_det_errors++;
-            fprintf(stderr, "Different SpTS iterations saw different error counts -- non-deterministic bug possible.\n");
-            fprintf(stderr, "\te.g. saw %d errors during iteration 0. Saw %d errors during iteration %i\n", errors_seen[0], errors_seen[1], 1);
-        }
-    }
-    for (int i = 2; i < iter; i++)
-    {
-        if (errors_seen[i] != errors_seen[1])
-        {
-            non_det_errors++;
-            if (non_det_errors == 1)
-            {
-                fprintf(stderr, "Different SpTS iterations saw different error counts -- non-deterministic bug possible.\n");
-                fprintf(stderr, "\te.g. saw %d errors during iteration 1. Saw %d errors during iteration %i\n", errors_seen[1], errors_seen[i], i);
-            }
-        }
-        else if (observed_errors[i] != observed_errors[1])
-        {
-            non_det_errors++;
-            if (non_det_errors == 1)
-            {
-                fprintf(stderr, "ERRORS were seen. Different iterations saw errors on different rows -- non-deterministic bug possible.\n");
-                fprintf(stderr, "\te.g. Iterations 1 and %d were different.\n", i);
-            }
-        }
-    }
-#endif
-    return non_det_errors;
-}
-
-template<typename FloatType>
-int SparseTriangularSolve<FloatType>::MaxErrors()
-{
-    int iter = InputFlags::GetValueInt("iterations");
-    int max_errors = 0;
-    for (int i = 0; i < iter; i++)
-    {
-        if (errors_seen[i] > max_errors)
-            max_errors = errors_seen[i];
-    }
-    return max_errors;
-}
-
-#ifndef __has_builtin
-#define __has_builtin(x) 0
-#endif
-
-static inline unsigned int flp2(unsigned int x)
-{
-    x |= (x >> 1);
-    x |= (x >> 2);
-    x |= (x >> 4);
-    x |= (x >> 8);
-    x |= (x >> 16);
-    return x - (x >> 1);
-}
-
-// Short rows in CSR-Adaptive are batched together into a single row block.
-// If there are a relatively small number of these, then we choose to do
-// a horizontal reduction (groups of threads all reduce the same row).
-// If there are many threads (e.g. more threads than the maximum size
-// of our workgroup) then we choose to have each thread serially reduce
-// the row.
-// This function calculates the number of threads that could team up
-// to reduce these groups of rows. For instance, if you have a
-// workgroup size of 256 and 4 rows, you could have 64 threads
-// working on each row. If you have 5 rows, only 32 threads could
-// reliably work on each row because our reduction assumes power-of-2.
-    template< typename rowBlockType >
-static inline rowBlockType numThreadsForReduction(const rowBlockType num_rows)
-{
-#if defined(__INTEL_COMPILER)
-    return 256 >> (_bit_scan_reverse(num_rows-1)+1);
-#elif (defined(__clang__) && __has_builtin(__builtin_clz)) || \
-    !defined(__clang) && \
-    defined(__GNUG__) && ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 30202)
-    return (256 >> (8*sizeof(int)-__builtin_clz(num_rows-1)));
-#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
-    unsigned long bit_returned;
-    _BitScanReverse(&bit_returned, (num_rows-1));
-    return 256 >> (bit_returned+1);
-#else
-    return flp2(256/num_rows);
-#endif
-}
-
-    template<typename FloatType>
-int SparseTriangularSolve<FloatType>::ComputeRowBlocks(std::vector<uint64_t> &rowBlocks,
-        int *rowDelimiters,
-        int nRows)
-{
-    rowBlocks.erase(rowBlocks.begin(), rowBlocks.end());
-    rowBlocks.push_back(0);
-    uint64_t sum = 0;
-    uint64_t i, last_i = 0;
-
-    // Check to ensure nRows can fit in 32 bits
-    if ((uint64_t) nRows > (uint64_t)pow(2, ROW_BITS))
-    {
-        fflush(stdout);
-        fprintf(stderr, "\nNumber of Rows in the Sparse Matrix is greater than what is supported at present (%d bits) !", ROW_BITS );
-        exit(0);
-    }
-
-    int consecutive_long_rows = 0;
-    for(i = 1; i <= nRows; i++)
-    {
-        int row_length = ( rowDelimiters[ i ] - rowDelimiters[ i - 1 ] );
-        sum += row_length;
-
-        // The following section of code calculates whether you're moving between
-        // a series of "short" rows and a series of "long" rows.
-        // This is because the reduction in CSR-Adaptive likes things to be
-        // roughly the same length. Long rows can be reduced horizontally.
-        // Short rows can be reduced one-thread-per-row. Try not to mix them.
-        if ( row_length > 128 )
-            consecutive_long_rows++;
-        else if ( consecutive_long_rows > 0 )
-        {
-            // If it turns out we WERE in a long-row region, cut if off now.
-            if (row_length < 32) // Now we're in a short-row region
-                consecutive_long_rows = -1;
-            else
-                consecutive_long_rows++;
-        }
-
-        // If you just entered into a "long" row from a series of short rows,
-        // then we need to make sure we cut off those short rows. Put them in
-        // their own workgroup.
-        if ( consecutive_long_rows == 1 )
-        {
-            // Assuming there *was* a previous workgroup. If not, nothing to do here.
-            if( i - last_i > 1 )
-            {
-                rowBlocks.push_back( (i - 1) << (64 - ROW_BITS) );
-                // If this row fits into CSR-Stream, calculate how many rows
-                // can be used to do a parallel reduction.
-                // Fill in the low-order bits with the numThreadsForRed
-                if (((i-1) - last_i) > 2)
-                    rowBlocks[rowBlocks.size() - 2] |= numThreadsForReduction((i - 1) - last_i);
-
-                last_i = i-1;
-                sum = row_length;
-            }
-        }
-        else if (consecutive_long_rows == -1)
-        {
-            // We see the first short row after some long ones that
-            // didn't previously fill up a row block.
-            rowBlocks.push_back( (i - 1) << (64 - ROW_BITS) );
-            if (((i-1) - last_i) > 2)
-                rowBlocks[rowBlocks.size() - 2] |= numThreadsForReduction((i - 1) - last_i);
-
-            last_i = i-1;
-            sum = row_length;
-            consecutive_long_rows = 0;
-        }
-
-        // Now, what's up with this row? What did it do?
-
-        // exactly one row results in non-zero elements to be greater than blockSize
-        // This is csr-vector case; bottom WG_BITS == workgroup ID
-        if( ( i - last_i == 1 ) && sum > 1024 )
-        {
-            int numWGReq = ceil( (double)row_length / (1024) );
-
-            // Check to ensure #workgroups can fit in WG_BITS bits, if not
-            // then the last workgroup will do all the remaining work
-            numWGReq = ( numWGReq < (int)pow( 2, WG_BITS ) ) ? numWGReq : (int)pow( 2, WG_BITS );
-
-            for( int w = 1; w < numWGReq; w++ )
-            {
-                rowBlocks.push_back((i-1) << ROW_BITS);
-                rowBlocks[rowBlocks.size() - 1] |= (uint64_t)w;
-            }
-            rowBlocks.push_back(i << ROW_BITS);
-
-            last_i = i;
-            sum = 0;
-            consecutive_long_rows = 0;
-        }
-        // more than one row results in non-zero elements to be greater than blockSize
-        // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads
-        else if( ( i - last_i > 1 ) && sum > 1024 )
-        {
-            i--; // This row won't fit, so back off one.
-            rowBlocks.push_back( i << (64 - ROW_BITS) );
-            if ((i - last_i) > 2)
-                rowBlocks[rowBlocks.size() - 2] |= numThreadsForReduction(i - last_i);
-            last_i = i;
-            sum = 0;
-            consecutive_long_rows = 0;
-        }
-        // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads
-        else if( sum == 1024 )
-        {
-            rowBlocks.push_back( i << (64 - ROW_BITS) );
-            if ((i - last_i) > 2)
-                rowBlocks[rowBlocks.size() - 2] |= numThreadsForReduction(i - last_i);
-            last_i = i;
-            sum = 0;
-            consecutive_long_rows = 0;
-        }
-    }
-
-    // If we didn't fill a row block with the last row, make sure we don't lose it.
-    if ( (rowBlocks[rowBlocks.size() - 2] >> (64 - ROW_BITS)) != (uint64_t)(nRows) )
-    {
-        rowBlocks.push_back( (uint64_t)( nRows ) << (64 - ROW_BITS) );
-        if ((nRows - last_i) > 2)
-            rowBlocks[rowBlocks.size() - 2] |= numThreadsForReduction(i - last_i);
-    }
-
-    return rowBlocks.size();
-}
-
-    template<typename FloatType>
-float SparseTriangularSolve<FloatType>::CSRSpTSGPU(uint64_t &ns_per_iter, uint64_t &ns_per_analysis_iter, uint64_t &ns_per_syncfree_iter, uint64_t &ns_per_levelset_iter, uint64_t &ns_per_levelsync_iter, FloatType alpha)
-{
-    gpuInt status;
-    gpuEvent* event_array;
-    #ifdef USE_HIP
-    hipSetDevice(this->Get_this_pe());
-    hipDeviceProp_t props;
-    hipGetDeviceProperties(&props, this->Get_this_pe());
-    printf("\nPE %d: PCIe BUS ID %d DEV ID %d\n", this->Get_this_pe(), props.pciBusID, props.pciDeviceID);
-    event_array = (gpuEvent*)malloc(sizeof(gpuEvent) * 2);
-    hipEventCreate(&event_array[0]);
-    hipEventCreate(&event_array[1]);
-    #else
-    event_array = (gpuEvent*)malloc(sizeof(gpuEvent));
-    #endif
-    size_t global_work_size;
-    size_t local_work_size = WF_PER_WG * WF_SIZE;
-
-
-    /*************************** Setup and create buffers ********************/
-    /****** Matrix Setup Code ******/
-    /* Get the OpenCL buffers for the input matrix */
-    memPointer bufNonZeroes = SparseMatrix<FloatType>::GetDevVals();
-    memPointer bufColumnIndices = SparseMatrix<FloatType>::GetDevCols();
-    memPointer bufRowPtrs = SparseMatrix<FloatType>::GetDevRowPtrs();
-    /* Get the host buffers for the input matrix */
-    FloatType *Avalues = SparseMatrix<FloatType>::GetVals();
-    int *Acols = SparseMatrix<FloatType>::GetCols();
-    int *rowptrs = SparseMatrix<FloatType>::GetRowPtrs();
-
-
-    /****** Adaptive RowBlocks Setup Code ******/
-    numBlocks = ComputeRowBlocks(rowBlocks, rowptrs, nRows);
-    rowBlocksDev = this->GPU->AllocateMem("rowBlocks", numBlocks*sizeof(int64_t), GPU_MEM_READ_WRITE, NULL);
-    uint64_t completedRows = 0;
-    completedRowsDev = this->GPU->AllocateMem("completedRows", sizeof(uint64_t), GPU_MEM_READ_WRITE|GPU_MEM_USE_HOST_PTR, &completedRows);
-
-    /****** SpTS Meta-Data Setup Code ******/
-    /* Set up the OpenCL buffers for the SpTS meta-data */
-    // TODO -- is this +1 in doneArray nRows+1 required? Why?
-    #ifdef USE_ROCSHMEM
-    doneArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t));
-    reqUpdateArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t));
-    shadowDoneArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t));
-    #else
-    doneArrayDev = this->GPU->AllocateMem("doneArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL);
-    reqUpdateArrayDev = this->GPU->AllocateMem("reqUpdateArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL);
-    shadowDoneArrayDev = this->GPU->AllocateMem("shadowDoneArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL);
-    #endif
-    remoteInProgressArrayDev = this->GPU->AllocateMem("remoteInProgressArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL);
-    numRowsAtLevelDev = this->GPU->AllocateMem("numRowsAtLevel", (nRows)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL);
-    rowMapDev = this->GPU->AllocateMem("rowMap", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_ONLY, NULL);
-    maxDepthDev = this->GPU->AllocateMem("maxDepth", sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL);
-    totalSpinDev = this->GPU->AllocateMem("totalSpin", sizeof(uint64_t), GPU_MEM_READ_WRITE, NULL);
-    oneBufDev = this->GPU->AllocateMem("oneBuf", sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL);
-    /* Set up the host buffers for the SpTS meta-data */
-    uint32_t *doneArray = (uint32_t*)calloc((nRows+1), sizeof(uint32_t));
-    uint32_t *numRowsAtLevel = (uint32_t*)calloc(nRows, sizeof(uint32_t));
-    uint32_t *rowMap = (uint32_t*)calloc((nRows+1), sizeof(uint32_t));
-    uint32_t maxDepth = 0;
-    uint64_t totalSpin = 0;
-
-    uint32_t *nrows_plus1_zero = (uint32_t*)calloc((nRows+1), sizeof(uint32_t));
-    uint64_t u64_zero = 0;
-    uint32_t u32_zero = 0;
-
-    //uint32_t uns_int_one = 0x42280000;
-    uint32_t u32_one = 1;
-
-    // TODO:  Gather and flatten out Avalues, Acols, and rowptrs based on
-    // row cyclic decomposition.  For now, we just copy the whole vals, cols,
-    // and row_ptrs matrix, even though we really only access 1/num_pes of the
-    // whole thing.  We can do some more sophisticated stuff here if we run out
-    // of space on the GPU or we don't like the copy overheads of the initial
-    // buffers.
-
-    /************************ Copy initial buffers to device *****************/
-    /****** Copy matrix ******/
-    this->GPU->CopyToDevice(bufNonZeroes, Avalues, this->nNZ*sizeof(FloatType), 0, GPU_TRUE, NULL);
-    this->GPU->CopyToDevice(bufColumnIndices, Acols, this->nNZ*sizeof(int), 0, GPU_TRUE, NULL);
-    this->GPU->CopyToDevice(bufRowPtrs, rowptrs, (this->nRows+1)*sizeof(int), 0, GPU_TRUE, NULL);
-
-    /****** Copy vectors ******/
-    this->GPU->CopyToDevice(xDev, x, nCols*sizeof(FloatType), 0, GPU_TRUE, NULL);
-    this->GPU->CopyToDevice(yDev, y_zero, nRows*sizeof(FloatType), 0, GPU_TRUE, NULL);
-
-    /****** Copy adaptive rowBlock information ******/
-    this->GPU->CopyToDevice(rowBlocksDev, rowBlocks.data(), numBlocks*sizeof(int64_t), 0, GPU_TRUE, NULL);
-
-    /****** Copy SpTS meta-data needed for analyze_and_solve run ******/
-    this->GPU->CopyToDevice(doneArrayDev, nrows_plus1_zero, (nRows+1)*sizeof(uint32_t), 0, GPU_TRUE, NULL);
-    this->GPU->CopyToDevice(shadowDoneArrayDev, nrows_plus1_zero, (nRows+1)*sizeof(uint32_t), 0, GPU_TRUE, NULL);
-    this->GPU->CopyToDevice(reqUpdateArrayDev, nrows_plus1_zero, (nRows+1)*sizeof(uint32_t), 0, GPU_TRUE, NULL);
-    this->GPU->CopyToDevice(remoteInProgressArrayDev, nrows_plus1_zero, (nRows+1)*sizeof(uint32_t), 0, GPU_TRUE, NULL);
-    this->GPU->CopyToDevice(numRowsAtLevelDev, nrows_plus1_zero, nRows*sizeof(uint32_t), 0, GPU_TRUE, NULL);
-    this->GPU->CopyToDevice(maxDepthDev, &u32_zero, sizeof(uint32_t), 0, GPU_TRUE, NULL);
-    this->GPU->CopyToDevice(totalSpinDev, &u64_zero, sizeof(uint64_t), 0, GPU_TRUE, NULL);
-    this->GPU->CopyToDevice(oneBufDev, &u32_one, sizeof(uint32_t), 0, GPU_TRUE, NULL);
-
-    /************************** Set up iteration printing ********************/
-    /* We want to print, ideally, every iteration that gets us 10% closer to
-     * completion. This sets that up */
-    int iter = InputFlags::GetValueInt("iterations");
-    double print_iter = (float)iter / 10.;
-    if (print_iter < 1.)
-        print_iter = 1.;
-    double next_to_print = 0.;
-
-
-    /**************************** Set up perf analysis ************************/
-    // For performance analysis, keep track of how much time we've spent doing
-    // kernel work.
-    // TODO -- Take in from the command line whether to get kernel or total time.
-    // If doing total time, try launching all of the kernels at once and waiting
-    // outside. This will apparently reduce the overheads.
-    uint64_t total_kern_time = 0;
-    uint64_t analyze_kern_time = 0;
-    double analyze_kern_flops = 0.;
-    uint64_t syncfree_kern_time = 0;
-    uint64_t levelset_kern_time = 0;
-    uint64_t levelsync_kern_time = 0;
-
-    errors_seen = new int[iter];
-
-    int analysis_iter = 0;
-    int syncfree_iter = 0;
-    int levelset_iter = 0;
-    int levelsync_iter = 0;
-
-    int level_sync_cutoff = InputFlags::GetValueInt("levelsync_size");
-    bool syncfree_better = false;
-
-    int total_workitems_per_workgroup = WF_SIZE * WF_PER_WG;
-    //bool rocshmem_initialized = false;
-
-    /*********************** Actual work of the benchmark *********************/
-    for(int i = 0; i < iter; i++)
-    {
-        if (i == (int)next_to_print || i == (iter - 1))
-        {
-            printf("%d..", i+1);fflush(stdout);
-            next_to_print += print_iter;
-        }
-
-#ifndef ALL_SYNCFREE
-#ifdef ALL_ANALYZE
-        // When we only want to run the analyze-and-solve mechanism, rather than
-        // the more optimized syncfree algorithm, we always go into here.
-        if (1)
-#else
-        // In any version of the program that has the possibility of running the
-        // level-set algorithm, we need to start with the syncfree-and-analyze
-        // version of the program, so that we can set up the potential to run the
-        // level-set algorithm. This will take place on the first iteration.
-        if (i == 0)
-#endif
-        {
-            analysis_iter++;
-            global_work_size = nRows * WF_SIZE;
-            #ifndef USE_HIP
-            CLHelper *CL = dynamic_cast<CLHelper*>(this->GPU);
-            CL->SetArgs(CLHelper::SpTSKernel_analyze, 0,
-                    bufNonZeroes,
-                    bufColumnIndices,
-                    bufRowPtrs,
-                    xDev,
-                    yDev,
-                    alpha,
-                    doneArrayDev,
-                    numRowsAtLevelDev,
-                    maxDepthDev,
-                    totalSpinDev);
-
-            status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_analyze, 1, NULL, &global_work_size, NULL, 0, NULL, &event_array[0]);
-            CL->checkStatus(status,"clEnqueueNDRangeKernel failed");
-            this->GPU->Flush();
-            total_kern_time += CL->ComputeTime(event_array[0]);
-            analyze_kern_time += CL->ComputeTime(event_array[0]);
-            #else
-            int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
-                                    / total_workitems_per_workgroup;
-            #ifdef USE_ROCSHMEM
-            global_work_size = this->nRows_p * WF_SIZE;
-            num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
-                                 / total_workitems_per_workgroup;
-	   /*
-	    int rocshmem_queues = (2560 / WF_PER_WG);
-	    if (2560 % WF_PER_WG)
-		rocshmem_queues++;
-            if (!rocshmem_initialized) {
-            	int num_threads = InputFlags::GetValueInt("num_roshmem_threads");
-                rocshmem_init(&handle, rocshmem_queues);
-                rocshmem_initialized = true;
-            }
-		*/
-            int rocshmem_algorithm = InputFlags::GetValueInt("rocshmem_algorithm");
-			int rocshmem_put_block_size = InputFlags::GetValueInt("put_block_size");
-			int rocshmem_get_backoff_factor = InputFlags::GetValueInt("get_backoff_factor");
-	    switch (rocshmem_algorithm) {
-		case 0:
-                	printf("Using Put-based intra-kernel algorithm\n");
-			break;
-		case 1:
-                	printf("Using Get-based intra-kernel algorithm (Backoff factor %d)\n", rocshmem_get_backoff_factor);
-			break;
-		case 2:
-                	printf("Using blocked Put-based intra-kernel algorithm\n");
-					printf("Using blocked Put-based intra-kernel algorithm (Block Size %d)\n", rocshmem_put_block_size);
-			break;
-		case 3:
-                	printf("Using put/get hybrid intra-kernel algorithm\n");
-			break;
-		default:
-			printf("Unknown rocSHMEM algorithm\n");
-			exit(-1);
-	   }
-            size_t LDS_size;
-            rocshmem_dynamic_shared(&LDS_size);
-            printf("Work size %zu, wg size %d num workgroups %d  LDS %zu  thisPE %d  Global %d \n", global_work_size, total_workitems_per_workgroup, num_of_workgroups, LDS_size,  this->Get_this_pe(), this->Get_total_pes());
-            MPI_Barrier(MPI_COMM_WORLD);
-            hipEventRecord(event_array[0], NULL);
-            hipLaunchKernelGGL(amd_spts_analyze_and_solve,
-                    dim3(num_of_workgroups),
-                    dim3(total_workitems_per_workgroup),
-                    LDS_size, 0,
-                    global_work_size,
-                    this->Get_this_pe(),
-                    this->Get_total_pes(),
-                    static_cast<unsigned int *>(shadowDoneArrayDev),
-                    static_cast<unsigned int *>(reqUpdateArrayDev),
-                    static_cast<unsigned int *>(remoteInProgressArrayDev),
-                    static_cast<unsigned int *>(oneBufDev),
-                    rocshmem_algorithm,
-					rocshmem_put_block_size,
-					rocshmem_get_backoff_factor,
-                    SPTS_BLOCK_SIZE,
-                    static_cast<FPTYPE *>(bufNonZeroes),
-                    static_cast<int *>(bufColumnIndices),
-                    static_cast<int *>(bufRowPtrs),
-                    static_cast<FPTYPE *>(xDev),
-                    static_cast<FPTYPE *>(yDev),
-                    alpha,
-                    static_cast<unsigned int *>(doneArrayDev),
-                    static_cast<unsigned int *>(numRowsAtLevelDev),
-                    static_cast<unsigned int *>(maxDepthDev),
-                    static_cast<unsigned long long *>(totalSpinDev));
-            #else
-            hipEventRecord(event_array[0], NULL);
-            hipLaunchKernelGGL(amd_spts_analyze_and_solve,
-                    dim3(num_of_workgroups),
-                    dim3(total_workitems_per_workgroup),
-                    0, 0,
-                    global_work_size,
-                    static_cast<FPTYPE *>(bufNonZeroes),
-                    static_cast<int *>(bufColumnIndices),
-                    static_cast<int *>(bufRowPtrs),
-                    static_cast<FPTYPE *>(xDev),
-                    static_cast<FPTYPE *>(yDev),
-                    alpha,
-                    static_cast<unsigned int *>(doneArrayDev),
-                    static_cast<unsigned int *>(numRowsAtLevelDev),
-                    static_cast<unsigned int *>(maxDepthDev),
-                    static_cast<unsigned long long *>(totalSpinDev));
-            #endif
-            hipEventRecord(event_array[1], NULL);
-            hipEventSynchronize(event_array[1]);
-
-            #ifdef USE_ROCSHMEM
-            // Wait for any outstanding network messages to finish up.  We
-            // can have straggler updates to the doneArray that we don't
-            // have any dependencies for but we still need it to finish so
-            // the below statistics can work correctly.
-            //ro_shmem_dump_stats(handle);
-            //ro_shmem_reset_stats(handle);
-			//sleep(10);
-			/* if( this->Get_this_pe() == 0 && (this->Get_total_pes() > 1)){
-			 	PRINT_SQ(get_rtn_handle(handle), 0, 1, 0);
-			 	PRINT_CQ(get_rtn_handle(handle), 0, 1, 0);
-			 	PRINT_SQ(get_rtn_handle(handle), 0, 1, 1);
-			 	PRINT_CQ(get_rtn_handle(handle), 0, 1, 1);
-			 	PRINT_SQ(get_rtn_handle(handle), 0, 1, 2);
-			 	PRINT_CQ(get_rtn_handle(handle), 0, 1, 2);
-				}*/
-            MPI_Barrier(MPI_COMM_WORLD);
-            #endif
-            float elapsed;
-            hipEventElapsedTime(&elapsed, event_array[0], event_array[1]);
-            total_kern_time += elapsed * 1000000;
-            analyze_kern_time += elapsed * 1000000;
-            #endif
-            analyze_kern_flops = (2 * (double)nNZ * 1000000000.) / (double)analyze_kern_time;
-            this->GPU->CopyToHost(yDev, y, nRows*sizeof(FloatType), 0, GPU_FALSE, NULL);
-            this->GPU->CopyToHost(maxDepthDev, &maxDepth, sizeof(uint32_t), 0, GPU_FALSE, NULL);
-            this->GPU->CopyToHost(doneArrayDev, doneArray, (nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL);
-            this->GPU->CopyToHost(totalSpinDev, &totalSpin, sizeof(uint64_t), 0, GPU_FALSE, NULL);
-            this->GPU->CopyToHost(numRowsAtLevelDev, numRowsAtLevel, nRows*sizeof(uint32_t), 0, GPU_TRUE, NULL);
-            this->GPU->Flush();
-
-            #ifdef USE_ROCSHMEM
-            // Combine global statistics
-            MPI_Allreduce(MPI_IN_PLACE, (void *) &maxDepth, 1, MPI_UNSIGNED, MPI_MAX, MPI_COMM_WORLD);
-            MPI_Allreduce(MPI_IN_PLACE, (void *) &totalSpin, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
-
-            // TODO: Broadcast out the doneArray and yDev values to all nodes.  This
-            // is needed for the below calculations to work since in the 'pull'
-            // distributed model we don't request data for rows that we don't
-            // have a dependency on.
-            #endif
-
-            bool verify = InputFlags::GetValueBool("verify");
-            if (verify) {
-                printf("Performing results verification\n");
-                errors_seen[i] = VerifyResults(i);
-            }
-            printf("\nTotalSpin: %lu\n", totalSpin);
-
-            /* Prefix sum of the number of rows at each level, so that we can
-             * calculate how much to offset each level into the rowMap */
-            // TODO -- Do this prefix sum on the GPU while copying maxDepth and
-            // doneArray back into the host. Set non-blocking on the previous ones.
-            this->GPU->CopyToHost(numRowsAtLevelDev, numRowsAtLevel, nRows*sizeof(uint32_t), 0, GPU_TRUE, NULL);
-            for (unsigned int joe = 1; joe < maxDepth; joe++)
-                numRowsAtLevel[joe] = numRowsAtLevel[joe] + numRowsAtLevel[joe-1];
-
-            /* Build up the rowMap so that each iteration of the no-wait solve
-             * knows what its global_id->row mapping is.
-             * The general mechanism for this is as follows:
-             * doneArray[row] holds the level that a particular row is in.
-             *
-             * We know the total number of levels needed (maxDepth), so rowMap
-             * has maxDepth 'buckets'.
-             *
-             * The numRowsAtLevel array (after the above prefix-sum) tells us
-             * how many values are in all of the previous buckets, so that
-             * we can get an appropriate array offset for each bucket.
-             *
-             * The counters array keeps track of how many items are in each
-             * bucket so far. Add this to the numRowsAtLevel[] offset.
-             *
-             * As we walk through all the rows, we check to see which level's
-             * bucket we should put this row in. Add it at the end of the
-             * current bucket, then increment the counter. */
-            uint32_t *counters = (uint32_t *)calloc(maxDepth, sizeof(uint32_t));
-/*            for (unsigned int this_row = 0; this_row < nRows; this_row++)
-            {
-                // We must subtract one here, because the first level is '1'
-                // The GPU kernel does that because a value of '0' means
-                // 'not done, keep waiting' in the analysis kernel.
-                assert(doneArray[this_row] != 0);
-                unsigned int this_rows_level = doneArray[this_row] - 1;
-                unsigned int previous_level = this_rows_level - 1;
-                unsigned int depth_offset;
-                if (this_rows_level == 0) // can't check previous level
-                    depth_offset = 0;
-                else
-                    depth_offset = numRowsAtLevel[previous_level];
-                rowMap[depth_offset + counters[this_rows_level]] = this_row;
-                counters[this_rows_level] += 1;
-            } */
-            free(counters);
-            this->GPU->CopyToDevice(rowMapDev, rowMap, (nRows+1)*sizeof(uint32_t), 0, GPU_TRUE, NULL);
-            free(event_array);
-            #ifdef USE_HIP
-            event_array = (gpuEvent*)malloc(maxDepth * sizeof(gpuEvent) * 2);
-            for (int i = 0; i < maxDepth * 2; i++)
-                hipEventCreate(&event_array[i]);
-            #else
-            event_array = (gpuEvent*)malloc(maxDepth * sizeof(gpuEvent));
-            #endif
-#ifdef ALL_ANALYZE
-            // We will be coming back into this kernel. Time to reset its data.
-            if (i != (iter - 1))
-            {
-                this->GPU->CopyToDevice(maxDepthDev, &u32_zero, sizeof(uint32_t), 0, GPU_FALSE, NULL);
-                this->GPU->CopyToDevice(totalSpinDev, &u64_zero, sizeof(uint64_t), 0, GPU_FALSE, NULL);
-                this->GPU->CopyToDevice(numRowsAtLevelDev, nrows_plus1_zero, nRows*sizeof(uint32_t), 0, GPU_FALSE, NULL);
-            }
-#endif
-            this->GPU->CopyToDevice(yDev, y_zero, nRows*sizeof(FloatType), 0, GPU_FALSE, NULL);
-            this->GPU->CopyToDevice(doneArrayDev, nrows_plus1_zero,(nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL);
-            this->GPU->CopyToDevice(shadowDoneArrayDev, nrows_plus1_zero,(nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL);
-            this->GPU->CopyToDevice(remoteInProgressArrayDev, nrows_plus1_zero,(nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL);
-    	    this->GPU->CopyToDevice(reqUpdateArrayDev, nrows_plus1_zero, (nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL);
-            this->GPU->Flush();
-            // Either we always want to run just this function block
-            // (ALL_ANALYZE), or the first iteration is the analyze-and-solve
-            // kernel. Either way, don't continue to the code below this time.
-            continue;
-    }
-#endif
-    // If ALL_SYNCFREE is defined, we always run the amd_spts_syncfree_solve
-    // kernel. We never try to speed it up by paying attention to the output
-    // levels and running the levelset kernel.
-    // If ALL_LEVELSET is set, we only run the analysis kernel up above to get the
-    // level-set and do the first solve; after that we skip the
-    // amd_spts_analyze_and_solve kernel and do the level-set based solve.
-    // Otherwise, we dynamically choose between those kernels based on some
-    // statistics that we gathered during the analyze-and-solve run.
-#ifdef ALL_SYNCFREE
-        if (1) // always run syncfree algorithm
-#elif defined(ALL_LEVELSET) || defined(ALL_LEVELSYNC)
-        if (0) // always *do not* run syncfree algorithm
-#else
-        if (totalSpin == 0 || analyze_kern_flops/totalSpin > 25000 || syncfree_better) // Try to run syncfree
-#endif
-        {
-            syncfree_iter++;
-            // TODO -- Eventually get this working with number of RowBlocks - 1
-            global_work_size = nRows * WF_SIZE;
-
-            uint32_t current_iteration = 0;
-
-            #ifdef USE_ROCSHMEM
-            fprintf(stderr, "rocSHMEM not supported for selected algorithm\n");
-            exit(-1);
-            #endif
-
-            #ifndef USE_HIP
-            CLHelper *CL = dynamic_cast<CLHelper*>(this->GPU);
-            CL->SetArgs(CLHelper::SpTSKernel, 0,
-                    bufNonZeroes,
-                    bufColumnIndices,
-                    bufRowPtrs,
-                    xDev,
-                    yDev,
-                    alpha,
-                    doneArrayDev,
-                    numRowsAtLevelDev,
-                    maxDepthDev,
-                    totalSpinDev);
-
-            status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel, 1, NULL, &global_work_size, NULL, 0, NULL, &event_array[0]);
-            CL->checkStatus(status,"clEnqueueNDRangeKernel failed");
-            current_iteration++;
-            this->GPU->Flush();
-            total_kern_time += CL->ComputeTime(event_array[0]);
-            syncfree_kern_time += CL->ComputeTime(event_array[0]);
-            #else
-            int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
-                                    / total_workitems_per_workgroup;
-            hipEventRecord(event_array[0], NULL);
-            hipLaunchKernelGGL(amd_spts_syncfree_solve,
-                    dim3(num_of_workgroups),
-                    dim3(total_workitems_per_workgroup),
-                    0, 0,
-                    global_work_size,
-                    static_cast<FPTYPE *>(bufNonZeroes),
-                    static_cast<int *>(bufColumnIndices),
-                    static_cast<int *>(bufRowPtrs),
-                    static_cast<FPTYPE *>(xDev),
-                    static_cast<FPTYPE *>(yDev),
-                    alpha,
-                    static_cast<unsigned int *>(doneArrayDev),
-                    static_cast<unsigned int *>(numRowsAtLevelDev),
-                    static_cast<unsigned int *>(maxDepthDev),
-                    static_cast<unsigned long long *>(totalSpinDev));
-            hipEventRecord(event_array[1], NULL);
-            hipEventSynchronize(event_array[1]);
-            current_iteration++;
-            float elapsed;
-            hipEventElapsedTime(&elapsed, event_array[0], event_array[1]);
-            total_kern_time += elapsed * 1000000;
-            syncfree_kern_time += elapsed * 1000000;
-
-            #endif
-
-            this->GPU->CopyToHost(yDev, y, nRows*sizeof(FloatType), 0, GPU_TRUE, NULL);
-            errors_seen[i] = VerifyResults(i);
-
-            this->GPU->Flush();
-            current_iteration = 0;
-            completedRows = 0;
-        }
-#if defined(ALL_LEVELSYNC)
-        else if (1) // always run levelset+syncfree combination
-#elif defined (ALL_LEVELSET)
-        else if (0) // Fall through to level-set
-#else
-        else if (1) // always run levelset+syncfree, never fall through to level-set only
-#endif
-        {
-            // This is the "level-sync" algorithm, where we take the level-set
-            // information and launch kernels that combine multiple levels
-            // together. This allows us to find parallelism to run on the GPU,
-            // even if technically there are some data dependencies between the
-            // levels. Within the kernel, we use the synchronization-free algorithm
-            // to ensure that we get the right answer.
-            // This algorithm reduces the spin-loop overhead of the sync-free
-            // algorithm if there are many levels, but it finds more parallelism
-            // than the pure level-set algorithms which can only run on one CU.
-            levelsync_iter++;
-
-            // Keep track of total kernels we launch so we can watch for events.
-            int total_enqueues = 0;
-
-            /* The rowMap tells each workgroup within the kernel what
-             * rows it is working on. However, each kernel invocation
-             * is working on a different level. Each level is in a separate
-             * 'bucket' in the rowMap. We must tell each invocation how far
-             * into the rowMap it must index. That's the depth_offset.
-             * numRowsAtLevel (after the above prefix-sum) tells us how
-             * many rows were in all previous levels combined. */
-            unsigned int depth_offset = 0;
-            unsigned int running_total = 0; // How many rows in this launch
-
-            if (level_sync_cutoff == 0)
-            {
-                if (nRows/maxDepth < 32)
-                    level_sync_cutoff = 2560;
-                else
-                    level_sync_cutoff = 81920;
-            }
-
-            #ifdef USE_ROCSHMEM
-            fprintf(stderr, "rocSHMEM not supported for selected algorithm\n");
-            exit(-1);
-            #endif
-
-            for (int this_level = 0; this_level < maxDepth; this_level++)
-            {
-                if (this_level != 0 && running_total == 0)
-                    depth_offset = numRowsAtLevel[this_level-1];
-
-                running_total = numRowsAtLevel[this_level] - depth_offset;
-
-                if (running_total >= level_sync_cutoff)
-                {
-                    global_work_size = (running_total + (running_total % WF_PER_WG)) * WF_SIZE;
-                    #ifndef USE_HIP
-                    CLHelper *CL = dynamic_cast<CLHelper*>(this->GPU);
-                    CL->SetArgs(CLHelper::SpTSKernel_levelsync, 0,
-                            bufNonZeroes,
-                            bufColumnIndices,
-                            bufRowPtrs,
-                            xDev,
-                            yDev,
-                            alpha,
-                            doneArrayDev,
-                            rowMapDev,
-                            depth_offset);
-                    status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_levelsync, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &event_array[total_enqueues]);
-                    this->GPU->checkStatus(status,"clEnqueueNDRangeKernel failed");
-                    #else
-                    int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
-                                            / total_workitems_per_workgroup;
-                    hipEventRecord(event_array[total_enqueues * 2], NULL);
-                    hipLaunchKernelGGL(amd_spts_levelsync_solve,
-                            dim3(num_of_workgroups),
-                            dim3(total_workitems_per_workgroup),
-                            0, 0,
-                            global_work_size,
-                            static_cast<FPTYPE *>(bufNonZeroes),
-                            static_cast<int *>(bufColumnIndices),
-                            static_cast<int *>(bufRowPtrs),
-                            static_cast<FPTYPE *>(xDev),
-                            static_cast<FPTYPE *>(yDev),
-                            alpha,
-                            static_cast<unsigned int *>(doneArrayDev),
-                            static_cast<unsigned int*>(rowMapDev),
-                            depth_offset);
-                    hipEventRecord(event_array[total_enqueues * 2 + 1], NULL);
-                    #endif
-                    total_enqueues++;
-                    running_total = 0;
-                }
-            }
-            if (running_total)
-            {
-                global_work_size = (running_total + (running_total % WF_PER_WG)) * WF_SIZE;
-                #ifndef USE_HIP
-                CLHelper *CL = dynamic_cast<CLHelper*>(this->GPU);
-                CL->SetArgs(CLHelper::SpTSKernel_levelsync, 0,
-                        bufNonZeroes,
-                        bufColumnIndices,
-                        bufRowPtrs,
-                        xDev,
-                        yDev,
-                        alpha,
-                        doneArrayDev,
-                        rowMapDev,
-                        depth_offset);
-                status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_levelsync, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &event_array[total_enqueues]);
-                this->GPU->checkStatus(status,"clEnqueueNDRangeKernel failed");
-                #else
-                int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
-                                        / total_workitems_per_workgroup;
-                hipEventRecord(event_array[total_enqueues * 2], NULL);
-                hipLaunchKernelGGL(amd_spts_levelsync_solve,
-                        dim3(num_of_workgroups),
-                        dim3(total_workitems_per_workgroup),
-                        0, 0,
-                        global_work_size,
-                        static_cast<FPTYPE *>(bufNonZeroes),
-                        static_cast<int *>(bufColumnIndices),
-                        static_cast<int *>(bufRowPtrs),
-                        static_cast<FPTYPE *>(xDev),
-                        static_cast<FPTYPE *>(yDev),
-                        alpha,
-                        static_cast<unsigned int *>(doneArrayDev),
-                        static_cast<unsigned int*>(rowMapDev),
-                        depth_offset);
-                hipEventRecord(event_array[total_enqueues * 2 + 1], NULL);
-                #endif
-                total_enqueues++;
-            }
-
-            // After we cross this clFinish, all of the kernel invocations have
-            // completed, and the final answer is in yDev. Now we should add up
-            // all of the kernel runtimes from all levels to see how long this
-            // levelset solve took.
-            this->GPU->Flush();
-            for (int this_enqueue = 0; this_enqueue < total_enqueues; this_enqueue++)
-            {
-                #ifndef USE_HIP
-                CLHelper *CL = dynamic_cast<CLHelper*>(this->GPU);
-                total_kern_time += CL->ComputeTime(event_array[this_enqueue]);
-                levelsync_kern_time += CL->ComputeTime(event_array[this_enqueue]);
-                #else
-                float elapsed;
-                hipEventElapsedTime(&elapsed, event_array[this_enqueue * 2], event_array[this_enqueue * 2 + 1]);
-                total_kern_time += elapsed * 1000000;
-                levelsync_kern_time += elapsed * 1000000;
-                #endif
-            }
-            // The analyze kernel is about 15% slower than the syncfree kernel.
-            // As such, if the level-sync version is < 15% faster, it's likely
-            // that syncfree will win. Let's go back to doing that.
-            if (i == 1 && (analyze_kern_time < (levelsync_kern_time * 1.15)))
-                syncfree_better = true;
-            this->GPU->CopyToHost(yDev, y, nRows*sizeof(FloatType), 0, GPU_TRUE, NULL);
-            errors_seen[i] = VerifyResults(i);
-        }
-        else // Run level-set algorithm
-        {
-            // This is the level-set SpTS kernel, which can be done after the
-            // first analyze-and-solve kernel. In this case, we know the levels
-            // that each row is in, so we can launch one kernel per level with
-            // exactly the right number of workgroups (one WG per row).
-            // This means that we don't have any in-kernel atomics, spin-loops,
-            // etc, so things run much faster. However, we much launch a
-            // potentially large number of kernels.
-            // Number of levels is maxDepth. */
-            levelset_iter++;
-
-            #ifdef USE_ROCSHMEM
-            fprintf(stderr, "rocSHMEM not supported for selected algorithm\n");
-            exit(-1);
-            #endif
-
-           // Keep track of total kernels we launch so we can watch for events.
-            int total_enqueues = 0;
-
-            unsigned int start_level = 0;
-            unsigned int end_level = 0;
-            unsigned int in_a_run = 0;
-            unsigned int running_total = 0;
-
-            // How far into the rowMap that lists which rows are in each level
-            unsigned int depth_offset = 0;
-
-            unsigned int total_vector = 0;
-            unsigned int total_levelset = 0;
-            for (int this_level = 0; this_level < maxDepth; this_level++)
-            {
-                unsigned int inner_depth_offset;
-                if (this_level == 0)
-                    inner_depth_offset = 0;
-                else
-                    inner_depth_offset = numRowsAtLevel[this_level-1];
-                unsigned int total_in_this_depth = numRowsAtLevel[this_level] - inner_depth_offset;
-
-                if (total_in_this_depth == 0)
-                    continue;
-
-                end_level = this_level;
-                // Comment out this if(){} section to force us to always
-                // launch the levelset kernel.
-                if (total_in_this_depth <= 2*WF_PER_WG)
-                {
-                    running_total += total_in_this_depth;
-                    if (in_a_run == 0)
-                    {
-                        start_level = this_level;
-                        depth_offset = inner_depth_offset;
-                        in_a_run = 1;
-                    }
-                }
-                else
-                {
-                    if (in_a_run)
-                    {
-                        global_work_size = WF_SIZE * WF_PER_WG;
-                        #ifndef USE_HIP
-                        CLHelper *CL = dynamic_cast<CLHelper*>(this->GPU);
-                        CL->SetArgs(CLHelper::SpTSKernel_vector, 0,
-                                bufNonZeroes,
-                                bufColumnIndices,
-                                bufRowPtrs,
-                                xDev,
-                                yDev,
-                                alpha,
-                                rowMapDev,
-                                numRowsAtLevelDev,
-                                depth_offset,
-                                start_level,
-                                end_level);
-                        status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_vector, 1, NULL, &global_work_size, &global_work_size, 0, NULL, &event_array[total_enqueues]);
-                        this->GPU->checkStatus(status,"clEnqueueNDRangeKernel failed");
-                        #else
-                        int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
-                                                / total_workitems_per_workgroup;
-                        hipEventRecord(event_array[total_enqueues * 2], NULL);
-                        hipLaunchKernelGGL(amd_spts_vector_solve,
-                                dim3(num_of_workgroups),
-                                dim3(total_workitems_per_workgroup),
-                                0, 0,
-                                global_work_size,
-                                static_cast<FPTYPE *>(bufNonZeroes),
-                                static_cast<int *>(bufColumnIndices),
-                                static_cast<int *>(bufRowPtrs),
-                                static_cast<FPTYPE *>(xDev),
-                                static_cast<FPTYPE *>(yDev),
-                                alpha,
-                                static_cast<unsigned int *>(rowMapDev),
-                                static_cast<unsigned int *>(numRowsAtLevelDev),
-                                depth_offset,
-                                start_level,
-                                end_level);
-                        hipEventRecord(event_array[total_enqueues * 2 + 1], NULL);
-                        #endif
-                        total_enqueues++;
-                        //printf("\n\tVector. offset %u Start %u End %u Rows in this enq %u\n", depth_offset, start_level, end_level, running_total);
-                        in_a_run = start_level = end_level = running_total = 0;
-                        depth_offset = numRowsAtLevel[this_level-1];
-                        total_vector++;
-                    }
-                    global_work_size = WF_SIZE * total_in_this_depth;
-                    #ifndef USE_HIP
-                    CLHelper *CL = dynamic_cast<CLHelper*>(this->GPU);
-                    CL->SetArgs(CLHelper::SpTSKernel_levelset, 0,
-                            bufNonZeroes,
-                            bufColumnIndices,
-                            bufRowPtrs,
-                            xDev,
-                            yDev,
-                            rowMapDev,
-                            depth_offset,
-                            alpha);
-                    status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_levelset, 1, NULL, &global_work_size, NULL, 0, NULL, &event_array[total_enqueues]);
-                    this->GPU->checkStatus(status,"clEnqueueNDRangeKernel failed");
-                    #else
-                    int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
-                                            / total_workitems_per_workgroup;
-                    hipEventRecord(event_array[total_enqueues * 2], NULL);
-                    hipLaunchKernelGGL(amd_spts_levelset_solve,
-                            dim3(num_of_workgroups),
-                            dim3(total_workitems_per_workgroup),
-                            0, 0,
-                            global_work_size,
-                            static_cast<FPTYPE *>(bufNonZeroes),
-                            static_cast<int *>(bufColumnIndices),
-                            static_cast<int *>(bufRowPtrs),
-                            static_cast<FPTYPE *>(xDev),
-                            static_cast<FPTYPE *>(yDev),
-                            static_cast<unsigned int *>(rowMapDev),
-                            depth_offset,
-                            alpha);
-                    hipEventRecord(event_array[total_enqueues * 2 + 1], NULL);
-                    #endif
-                    total_enqueues++;
-                    depth_offset = numRowsAtLevel[this_level];
-                    total_levelset++;
-                }
-            }
-            end_level++;
-            if (in_a_run)
-            {
-                #ifndef USE_HIP
-                CLHelper *CL = dynamic_cast<CLHelper*>(this->GPU);
-                CL->SetArgs(CLHelper::SpTSKernel_vector, 0,
-                        bufNonZeroes,
-                        bufColumnIndices,
-                        bufRowPtrs,
-                        xDev,
-                        yDev,
-                        alpha,
-                        rowMapDev,
-                        numRowsAtLevelDev,
-                        depth_offset,
-                        start_level,
-                        end_level);
-                global_work_size = WF_SIZE * WF_PER_WG;
-                status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_vector, 1, NULL, &global_work_size, &global_work_size, 0, NULL, &event_array[total_enqueues]);
-                this->GPU->checkStatus(status,"clEnqueueNDRangeKernel failed");
-                #else
-                int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1)
-                                        / total_workitems_per_workgroup;
-                hipEventRecord(event_array[total_enqueues * 2], NULL);
-                hipLaunchKernelGGL(amd_spts_vector_solve,
-                        dim3(num_of_workgroups),
-                        dim3(total_workitems_per_workgroup),
-                        0, 0,
-                        global_work_size,
-                        static_cast<FPTYPE *>(bufNonZeroes),
-                        static_cast<int *>(bufColumnIndices),
-                        static_cast<int *>(bufRowPtrs),
-                        static_cast<FPTYPE *>(xDev),
-                        static_cast<FPTYPE *>(yDev),
-                        alpha,
-                        static_cast<unsigned int *>(rowMapDev),
-                        static_cast<unsigned int *>(numRowsAtLevelDev),
-                        depth_offset,
-                        start_level,
-                        end_level);
-                hipEventRecord(event_array[total_enqueues * 2 + 1], NULL);
-                #endif
-                total_enqueues++;
-                //printf("\n\tVector. offset %u Start %u End %u Rows in this enq %u\n", depth_offset, start_level, end_level, running_total);
-                in_a_run = start_level = end_level = running_total = 0;
-                total_vector++;
-            }
-
-            if (i == 1)
-                printf("\nTotal Vector: %u\nTotal levelset: %u\n", total_vector, total_levelset);
-            // After we cross this clFinish, all of the kernel invocations have
-            // completed, and the final answer is in yDev. Now we should add up
-            // all of the kernel runtimes from all levels to see how long this
-            // levelset solve took.
-            this->GPU->Flush();
-            for (int this_enqueue = 0; this_enqueue < total_enqueues; this_enqueue++)
-            {
-                #ifndef USE_HIP
-                CLHelper *CL = dynamic_cast<CLHelper*>(this->GPU);
-                total_kern_time += CL->ComputeTime(event_array[this_enqueue]);
-                levelset_kern_time += CL->ComputeTime(event_array[this_enqueue]);
-                #else
-                float elapsed;
-                hipEventElapsedTime(&elapsed, event_array[this_enqueue * 2], event_array[this_enqueue * 2 + 1]);
-                total_kern_time += elapsed * 1000000;
-                levelset_kern_time += elapsed * 1000000;
-                #endif
-            }
-            this->GPU->CopyToHost(yDev, y, nRows*sizeof(FloatType), 0, GPU_TRUE, NULL);
-            errors_seen[i] = VerifyResults(i);
-        }
-
-#ifndef ALL_SYNCFREE
-        if (i == 1)
-            printf("\nmaxDepth %d\n", maxDepth);
-#endif
-        if (i != (iter - 1))
-        {
-            this->GPU->CopyToDevice(yDev, y_zero, nRows*sizeof(FloatType), 0, GPU_FALSE, NULL);
-            this->GPU->CopyToDevice(doneArrayDev, nrows_plus1_zero,(nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL);
-            this->GPU->Flush();
-        }
-    }
-
-    float gflops = 0.f;
-    printf("\n\nnnz: %d\n", nNZ);
-    gflops = (float)(2 * nNZ) / (float)(total_kern_time/iter);
-    ns_per_iter = total_kern_time/iter;
-
-    if (analysis_iter > 0)
-        ns_per_analysis_iter = analyze_kern_time / analysis_iter;
-    else
-        ns_per_analysis_iter = 0;
-    if (syncfree_iter > 0)
-        ns_per_syncfree_iter = syncfree_kern_time / syncfree_iter;
-    else
-        ns_per_syncfree_iter = 0;
-    if (levelset_iter > 0)
-        ns_per_levelset_iter = levelset_kern_time / levelset_iter;
-    else
-        ns_per_levelset_iter = 0;
-    if (levelsync_iter > 0)
-        ns_per_levelsync_iter = levelsync_kern_time / levelsync_iter;
-    else
-        ns_per_levelsync_iter = 0;
-
-    this->GPU->CopyToHost(yDev, y, nRows*sizeof(FloatType), 0, GPU_TRUE, NULL);
-
-    if (doneArray)
-        free(doneArray);
-    if (numRowsAtLevel)
-        free(numRowsAtLevel);
-    if (rowMap)
-        free(rowMap);
-    if (event_array)
-        free(event_array);
-
-    return gflops;
-}
-
-#endif //SpTS_H
diff --git a/projects/rocshmem/internal/clients/spts/SparseMatrix.h b/projects/rocshmem/internal/clients/spts/SparseMatrix.h
deleted file mode 100644
index bd36d65a00..0000000000
--- a/projects/rocshmem/internal/clients/spts/SparseMatrix.h
+++ /dev/null
@@ -1,287 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-#ifndef SparseMatrix_H
-#define SparseMatrix_H
-
-#include "GPUHelper.h"
-#ifndef USE_HIP
-#include "OpenCLHelper.h"
-#include <CL/cl.h>
-#else
-#include "HIPHelper.h"
-#endif
-
-#include "InputFlags.h"
-#include "MatrixMarketReader.h"
-#include "OpenCLHelper.h"
-#include <algorithm>
-#include <cassert>
-
-template<typename FloatType>
-class SparseMatrix 
-{
-	
-    public:
-	int nRows;
-	int nCols;
-	int nNZ;
-	
-	int *cols;
-	int *row_ptrs;
-	
-	FloatType *vals;
-	
-	memPointer d_cols;
-	memPointer d_vals;
-	memPointer d_row_ptrs;
-
-    // info about parallel procs
-    int this_pe;
-    int total_pes;
-
-    int nRows_p;
-    int nCols_p;
-
-    protected:
-
-    GPUHelper *GPU;
-
-	public:
-
-	SparseMatrix() : nRows(0), nCols(0), nNZ(0), nRows_p(0), nCols_p(0)
-	{
-		cols = NULL;
-		row_ptrs = NULL;
-		vals = NULL;
-
-        d_cols = NULL;
-        d_vals = NULL;
-        d_row_ptrs = NULL;
-
-        this_pe = -1;//rocshmem_my_pe(handle); // this pe
-        total_pes = -1;//rocshmem_n_pes(handle);  // total number of pes
-
-    }
-	void AllocateSparseMatrix(MatrixMarketReader<FloatType> &mm_reader,
-						InputFlags &in_flags,
-						GPUHelper *gpu);
-    void AllocateParallelSparseMatrix(MatrixMarketReader<FloatType> &mm_reader,
-            InputFlags &in_flags);
-	void ConvertFromCOOToCSR(Coordinate<FloatType> *coords,
-						InputFlags &in_flags);
-
-    void PopulateParallelSparseMatrix(MatrixMarketReader<FloatType> &mm_reader,
-            InputFlags &in_flags);
-
-    void FindStatsForParallelDecomposition();
-
-    void Set_total_pes(int val){
-        this->total_pes = val;
-    }
-    void Set_this_pe(int val){
-        this->this_pe = val;
-    }
-
-    int Get_total_pes(){
-        return this->total_pes;
-    }
-    int Get_this_pe(){
-        return this->this_pe;
-    }
-
-    int GetNumRows_p() {return nRows_p;}
-
-	int *GetCols() { return cols; }
-	FloatType *GetVals() { return vals; }
-	int *GetRowPtrs() { return row_ptrs; }
-
-	memPointer GetDevCols() {return d_cols; }
-	memPointer GetDevVals() {return d_vals; }
-	memPointer GetDevRowPtrs() {return d_row_ptrs; }
-
-	~SparseMatrix()
-	{
-		delete[] cols;
-		delete[] vals;
-		delete[] row_ptrs;
-
-		GPU->FreeMem(d_cols);
-		GPU->FreeMem(d_vals);
-		GPU->FreeMem(d_row_ptrs);
-	}
-};
-
-template<typename FloatType>
-void SparseMatrix<FloatType>::AllocateSparseMatrix(MatrixMarketReader<FloatType> &mm_reader,
-					InputFlags &in_flags,
-					GPUHelper *gpu)
-{
-    GPU = gpu;
-	nRows = mm_reader.GetNumRows();
-	nCols = mm_reader.GetNumCols();
-	nNZ = mm_reader.GetNumNonZeroes();
-    printf("Allocating a sparse matrix with-- nRows: %d nCols: %d nNZ: %d\n", nRows, nCols, nNZ);
-
-    assert(total_pes != -1);
-    assert(this_pe != -1);
-
-    #ifdef USE_RO_SHMEM
-    if (nRows != nCols){
-        fprintf(stderr, "RO_SHMEM port requires the global matrix to be "
-                "square!\n");
-        exit(-1);
-    }
-    #endif
-
-	cols = new int[nNZ];
-    if (cols == NULL)
-    {
-        fprintf(stderr, "Failed to allocate host-side cols array !\n");
-        exit(-1);
-    }
-	vals = new FloatType[nNZ];
-    if (vals == NULL)
-    {
-        fprintf(stderr, "Failed to allocate host-side vals array !\n");
-        exit(-1);
-    }
-	row_ptrs = new int[nRows + 1];
-    if (row_ptrs == NULL)
-    {
-        fprintf(stderr, "Failed to allocate host-side row_ptrs array !\n");
-        exit(-1);
-    }
-}
-
-template<typename FloatType>
-bool CoordinateCompare(const Coordinate<FloatType> &c1, const Coordinate<FloatType> &c2)
-{
-	if(c1.x != c2.x)
-		return (c1.x < c2.x);
-	else
-		return (c1.y < c2.y);
-}
-
-template<typename FloatType>
-void SparseMatrix<FloatType>::ConvertFromCOOToCSR(Coordinate<FloatType> *coords,
-					InputFlags &in_flags)
-{
-	std::sort(coords, coords + nNZ, CoordinateCompare<FloatType>);
-
-	int current_row = 1;
-    bool has_seen_diagonal = false;
-	row_ptrs[0] = 0;
-	for (int i = 0; i < nNZ; i++)
-	{
-		cols[i] = coords[i].y;
-		vals[i] = coords[i].val;
-        //fprintf(stderr,"Row %d Col %d Val %lf (cur_row: %d)\n", coords[i].x, coords[i].y, coords[i].val, current_row-1);
-
-		while(coords[i].x >= current_row)
-        {
-            // We've reached the end of a row. Did we see a diagonal?
-            // If not, the triangular solve will be underconstrained.
-            if (!has_seen_diagonal)
-            {
-                fprintf(stderr, "ERROR Converting the COO to CSR.\n");
-                fprintf(stderr, "\tMissing diagonal on row %d\n", current_row-1);
-                exit(-1);
-            }
-            has_seen_diagonal = false;
-			row_ptrs[current_row] = i;
-            current_row++;
-        }
-        if (coords[i].x == coords[i].y)
-            has_seen_diagonal = true;
-
-	}
-	row_ptrs[current_row++] = nNZ;
-    while (current_row <= nRows)
-    {
-        if (!has_seen_diagonal)
-        {
-            fprintf(stderr, "ERROR Converting the COO to CSR.\n");
-            fprintf(stderr, "\tNo values on row %d, so no diagonal.\n", current_row-1);
-            exit(-1);
-        }
-        has_seen_diagonal = false;
-        row_ptrs[current_row++] = nNZ;
-    }
-}
-
-template<typename FloatType>
-void SparseMatrix<FloatType>::AllocateParallelSparseMatrix(MatrixMarketReader<FloatType> &mm_reader,
-        InputFlags &in_flags)
-{
-    d_cols = GPU->AllocateMem("cols", nNZ*sizeof(int), 0, NULL);
-    d_vals = GPU->AllocateMem("vals", nNZ*sizeof(FloatType), 0, NULL);
-    d_row_ptrs = GPU->AllocateMem("row_ptrs", (nRows+1)*sizeof(int), 0, NULL);
-}
-
-template<typename FloatType>
-void SparseMatrix<FloatType>::FindStatsForParallelDecomposition()
-{
-
-    assert(SPTS_BLOCK_SIZE % 64 == 0);
-
-    // Rows left over in the potentially partial final block
-    int left_over_last_block = nRows % SPTS_BLOCK_SIZE;
-    printf("%d: lolb %d\n", this_pe, left_over_last_block);
-    // Number of complete blocks, not including any partial block at the end
-    int total_blocks = nRows / SPTS_BLOCK_SIZE;
-    printf("%d: totb %d\n", this_pe, total_blocks);
-
-    // Everyone has at least this many rows
-    nRows_p = (total_blocks / total_pes) * SPTS_BLOCK_SIZE;
-    printf("%d: initial nRows_p %d\n", this_pe, nRows_p);
-
-    // Last cycle might not assign to all PEs
-    int straggler_blocks = total_blocks % total_pes;
-    if (this_pe < straggler_blocks)
-        nRows_p += SPTS_BLOCK_SIZE;
-    printf("%d: straggler nRows_p %d\n", this_pe, nRows_p);
-    
-    // Last block of last cycle might have less than SPTS_BLOCK_SIZE rows
-    if (left_over_last_block) {
-        int final_pe = ((total_blocks + 1) % total_pes) - 1;
-        if (final_pe == -1)
-            final_pe = total_pes - 1;
-        if (this_pe == final_pe)
-            nRows_p += left_over_last_block;
-    }
-    printf("%d: final nRows_p %d\n", this_pe, nRows_p);
-
-    if (nRows_p <= 0) {
-        fprintf(stderr, "Block Size %d too small for input row size %d with "
-                "%d number of nodes.  Please decrease the block size or "
-                "decrease the number of nodes\n", SPTS_BLOCK_SIZE, nRows,
-                total_pes);
-        exit(-1);
-    }
-
-    // print to check!
-    printf("\nPE: %d total_rows: %d my_rows: %d\n", this_pe, nRows, nRows_p);
-
-    nCols_p = nCols; // 1D decomposition
-}
-
-#endif
diff --git a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_hip b/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_hip
deleted file mode 100755
index 03b392ace3..0000000000
--- a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_hip
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-src_path=$(dirname "$(realpath $0)")/..
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DUSE_ROCSHMEM=OFF \
-    -DUSE_HIP=ON \
-    -DALL_ANALYZE=ON \
-    -DUSE_DOUBLE=OFF \
-    -DALL_LEVELSET=OFF \
-    -DALL_LEVELSYNC=OFF \
-    -DALL_SYNCFREE=OFF \
-    $src_path
-cmake --build . --parallel 8
diff --git a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_opencl b/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_opencl
deleted file mode 100755
index 41db75f17f..0000000000
--- a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_opencl
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-src_path=$(dirname "$(realpath $0)")/..
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DUSE_ROCSHMEM=OFF \
-    -DUSE_HIP=OFF \
-    -DALL_ANALYZE=ON \
-    -DUSE_DOUBLE=OFF \
-    -DALL_LEVELSET=OFF \
-    -DALL_LEVELSYNC=OFF \
-    -DALL_SYNCFREE=OFF \
-    $src_path
-cmake --build . --parallel 8
diff --git a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_rocshmem b/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_rocshmem
deleted file mode 100755
index c542aec341..0000000000
--- a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_rocshmem
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-if [ -z $1 ]
-then
-  install_path=~/rocshmem
-else
-  install_path=$1
-fi
-
-src_path=$(dirname "$(realpath $0)")/..
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -DUSE_ROCSHMEM=ON \
-    -DUSE_HIP=ON \
-    -DALL_ANALYZE=ON \
-    -DUSE_DOUBLE=OFF \
-    -DALL_LEVELSET=OFF \
-    -DALL_LEVELSYNC=OFF \
-    -DALL_SYNCFREE=OFF \
-    -Drocshmem_DIR=$install_path/share/cmake/rocshmem \
-    $src_path
-cmake --build . --parallel 8
diff --git a/projects/rocshmem/internal/clients/spts/config.h.in b/projects/rocshmem/internal/clients/spts/config.h.in
deleted file mode 100644
index a9d4d814a2..0000000000
--- a/projects/rocshmem/internal/clients/spts/config.h.in
+++ /dev/null
@@ -1,7 +0,0 @@
-#cmakedefine USE_ROCSHMEM
-#cmakedefine USE_HIP
-#cmakedefine ALL_ANALYZE
-#cmakedefine USE_DOUBLE
-#cmakedefine ALL_LEVELSET
-#cmakedefine ALL_LEVELSYNC
-#cmakedefine ALL_SYNCFREE
diff --git a/projects/rocshmem/internal/clients/spts/driver.sh b/projects/rocshmem/internal/clients/spts/driver.sh
deleted file mode 100755
index 418ebdd525..0000000000
--- a/projects/rocshmem/internal/clients/spts/driver.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-#!/bin/bash
-
-echo Test Name $2
-
-INPUTS=/mnt/mlebeane/spts_data
-
-case $2 in
-    *"single_thread")
-        mpirun -np 2 $1 -f $INPUTS/test_matrices/diagonal_large.mtx -a 2 -b 512 -p 64 -v -i 3 > $3/diagonal_large_bput.log
-        mpirun -np 2 $1 -f $INPUTS/test_matrices/not_quite_diagonal.mtx -a 2 -b 256 -p 64 -v -i 3 > $3/not_quite_diagonal_bput.log
-        ;;
-    *"multi_thread")
-        mpirun -np 2 $1 -f $INPUTS/test_matrices/diagonal_large.mtx -a 2 -b 512 -p 64 -v -i 3 > $3/diagonal_large_bput.log
-        mpirun -np 2 $1 -f $INPUTS/test_matrices/not_quite_diagonal.mtx -a 2 -b 256 -p 64 -v -i 3 > $3/not_quite_diagonal_bput.log
-        mpirun -np 2 $1 -f $INPUTS/test_matrices/not_quite_diagonal.mtx -a 1 -b 256 -v -i 3 > $3/not_quite_diagonal_get.log
-        ;;
-    *)
-        echo "UNKNOWN TEST TYPE: $2"
-        exit -1
-        ;;
-esac
-
-exit $?
diff --git a/projects/rocshmem/internal/clients/spts/mmio.h b/projects/rocshmem/internal/clients/spts/mmio.h
deleted file mode 100644
index b83946d231..0000000000
--- a/projects/rocshmem/internal/clients/spts/mmio.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* 
-*   Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
-
-#ifndef MM_IO_H
-#define MM_IO_H
-
-/********************* MM_typecode query fucntions ***************************/
-
-#define mm_is_matrix(typecode)	((typecode)[0]=='M')
-
-#define mm_is_sparse(typecode)	((typecode)[1]=='C')
-#define mm_is_coordinate(typecode)((typecode)[1]=='C')
-#define mm_is_dense(typecode)	((typecode)[1]=='A')
-#define mm_is_array(typecode)	((typecode)[1]=='A')
-
-#define mm_is_complex(typecode)	((typecode)[2]=='C')
-#define mm_is_real(typecode)		((typecode)[2]=='R')
-#define mm_is_pattern(typecode)	((typecode)[2]=='P')
-#define mm_is_integer(typecode) ((typecode)[2]=='I')
-
-#define mm_is_symmetric(typecode)((typecode)[3]=='S')
-#define mm_is_general(typecode)	((typecode)[3]=='G')
-#define mm_is_skew(typecode)	((typecode)[3]=='K')
-#define mm_is_hermitian(typecode)((typecode)[3]=='H')
-
-/********************* MM_typecode modify fucntions ***************************/
-
-#define mm_set_matrix(typecode)	((typecode)[0]='M')
-#define mm_set_coordinate(typecode)	((typecode)[1]='C')
-#define mm_set_array(typecode)	((typecode)[1]='A')
-#define mm_set_dense(typecode)	mm_set_array(typecode)
-#define mm_set_sparse(typecode)	mm_set_coordinate(typecode)
-
-#define mm_set_complex(typecode)((typecode)[2]='C')
-#define mm_set_real(typecode)	((typecode)[2]='R')
-#define mm_set_pattern(typecode)((typecode)[2]='P')
-#define mm_set_integer(typecode)((typecode)[2]='I')
-
-
-#define mm_set_symmetric(typecode)((typecode)[3]='S')
-#define mm_set_general(typecode)((typecode)[3]='G')
-#define mm_set_skew(typecode)	((typecode)[3]='K')
-#define mm_set_hermitian(typecode)((typecode)[3]='H')
-
-#define mm_clear_typecode(typecode) ((typecode)[0]=(typecode)[1]= \
-									(typecode)[2]=' ',(typecode)[3]='G')
-
-#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
-
-
-/********************* Matrix Market error codes ***************************/
-
-
-#define MM_COULD_NOT_READ_FILE	11
-#define MM_PREMATURE_EOF		12
-#define MM_NOT_MTX				13
-#define MM_NO_HEADER			14
-#define MM_UNSUPPORTED_TYPE		15
-#define MM_LINE_TOO_LONG		16
-#define MM_COULD_NOT_WRITE_FILE	17
-
-#define MM_MTX_STR		"matrix"
-#define MM_ARRAY_STR	"array"
-#define MM_DENSE_STR	"array"
-#define MM_COORDINATE_STR "coordinate" 
-#define MM_SPARSE_STR	"coordinate"
-#define MM_COMPLEX_STR	"complex"
-#define MM_REAL_STR		"real"
-#define MM_INT_STR		"integer"
-#define MM_GENERAL_STR  "general"
-#define MM_SYMM_STR		"symmetric"
-#define MM_HERM_STR		"hermitian"
-#define MM_SKEW_STR		"skew-symmetric"
-#define MM_PATTERN_STR  "pattern"
-
-#define MM_MAX_LINE_LENGTH 1025
-#define MM_MAX_TOKEN_LENGTH 64
-#define MatrixMarketBanner "%%MatrixMarket"
-#define MAX_RAND_VAL 5.0
-
-#endif
diff --git a/projects/rocshmem/internal/clients/spts/spts_kernel.h b/projects/rocshmem/internal/clients/spts/spts_kernel.h
deleted file mode 100644
index 69a7c458bb..0000000000
--- a/projects/rocshmem/internal/clients/spts/spts_kernel.h
+++ /dev/null
@@ -1,2107 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- ********************************************************************************/
-
-#include "GPUHelper.h"
-
-#include <hip/hip_runtime.h>
-#include <hip/math_functions.h>
-#include <hip/device_functions.h>
-
-#ifdef USE_ROCSHMEM
-#include "rocshmem.hpp"
-using namespace rocshmem;
-#endif
-
-#ifndef WF_PER_WG
-#error "WF_PER_WG undefined!"
-#endif
-
-#ifndef WF_SIZE
-#error "WF_SIZE undefind!"
-#endif
-
-#define as_uint (unsigned int)
-#define as_ulong (unsigned long long)
-#define as_float (float)
-
-#ifdef USE_DOUBLE
-typedef double FPTYPE;
-#else
-typedef float FPTYPE;
-#endif
-
-// GCN3 and below require slightly different inline asm than Vega
-// v_add_u32 requires a "vcc" register output modifier on GCN3, but not on Vega
-// global_load_ in Vega is required to be flat_load_ in GCN3 and below.
-// Same for global_store_ and flat_store_.
-// However, the global_ instructions require an "off" modifier.
-#if defined(GCN3) || defined(GCN2)
-#define VCC "vcc"
-#define MEM_PREFIX "flat"
-#define OFF_MODIFIER ""
-#else
-#define VCC ""
-#define MEM_PREFIX "global"
-#define OFF_MODIFIER "off"
-#endif
-
-#ifndef GCN2
-#define LGKMCNT_0 0xc07f // GCN3 added more VMCNT bits at the upper end of the SIMM16
-#define WAKEUP "s_wakeup"
-#else
-#define LGKMCNT_0 0x7f
-#define WAKEUP "" // s_wakeup not supported on old GPUs
-#endif
-
-#define __builtin_amdgcn_ds_bpermute __hip_ds_bpermute
-#define __builtin_amdgcn_ds_swizzle __hip_ds_swizzle
-#define __builtin_amdgcn_mov_dpp __hip_move_dpp
-
-#define HIP_ENABLE_PRINTF
-
-// Internal functions to wrap atomics, depending on if we support 64-bit
-// atomics or not. Helps keep the code clean in the other parts of the code.
-// All of the 32-bit atomics are built assuming we're on a little endian architecture.
-__device__
-inline unsigned long spts_atomic_cmpxchg(unsigned long long *const ptr,
-                                    const unsigned long long compare,
-                                    const unsigned long long val)
-{
-#ifdef USE_DOUBLE
-	return atomicCAS(ptr, compare, val);
-#else
-	return atomicCAS(ptr, compare, val);
-#endif
-}
-
-__device__
-void atomic_set (FPTYPE *ptr, FPTYPE temp)
-{
-#ifdef USE_DOUBLE
-    unsigned long long newVal;
-    unsigned long long prevVal;
-    do
-    {
-        prevVal = as_ulong(*ptr);
-        newVal = as_ulong(temp);
-    } while (spts_atomic_cmpxchg((unsigned long long *)ptr, prevVal, newVal) != prevVal);
-
-#else
-    unsigned long long newVal;
-    unsigned long long prevVal;
-    do
-    {
-        prevVal = as_uint(*ptr);
-        newVal = as_uint(temp);
-    } while (spts_atomic_cmpxchg((unsigned long long *)ptr, prevVal, newVal) != prevVal);
-#endif
-}
-
-__device__
-inline void atomic_set_done(uint * done_array, uint row, uint val_to_set)
-{
-    atomicOr(&(done_array[row]), val_to_set);
-}
-
-__device__
-inline unsigned int atomic_get_done(uint * done_array, uint val_to_check)
-{
-    return atomicOr(&(done_array[val_to_check]), 0x0);
-}
-
-// Use a traditional LDS-based reduction to have all of the threads in the wave
-// add their values into OUTPUT_THREAD's variable.
-__device__
-FPTYPE lds_reduction(FPTYPE temp_sum, __shared__ FPTYPE *lds,
-        unsigned int start_of_this_row, unsigned int end_of_this_row,
-        unsigned int wg_lid)
-{
-    const unsigned int lid = wg_lid % WF_SIZE;
-
-    // Have all the threads in a workgroup reduce their data into a single
-    // value that's then read by the lead thread
-    // We start by calculating how many layers of reduction we actually need.
-    // If this is a very short row (smaller than our wavefront size), then we don't need
-    // to do all iterations of the below loop.
-    unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE);
-    // find next highest power of two. So if we have 5 things to reduce, we need to
-    // do a reduction from 8 threads' values. The last 3 will be '0'
-    num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1));
-
-    for (int i = num_items >> 1; i > 0; i >>= 1)
-    {
-        lds[wg_lid] = temp_sum;
-        asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-
-        if (lid < i)
-            temp_sum += lds[wg_lid + i];
-        asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-    }
-    // at this point, thread 0's "temp_sum" contains the final useful value.
-    return temp_sum;
-}
-
-// Use a traditional LDS-based reduction to have all of the threads in the wave
-// add their values into OUTPUT_THREAD's variable.
-// It hides the max work behind the same s_waitcnt on local memory,
-// so it should be faster than calling the reduce function twice in a row.
-__device__
-FPTYPE lds_reduction_two(FPTYPE temp_sum, unsigned int row_max_depth,
-        __shared__ FPTYPE *lds, __shared__ unsigned int *max_depth,
-        unsigned int start_of_this_row, unsigned int end_of_this_row,
-        unsigned int wg_lid)
-{
-    const unsigned int lid = wg_lid % WF_SIZE;
-
-    // Have all the threads in a workgroup reduce their data into a single
-    // value that's then read by the lead thread
-    // We start by calculating how many layers of reduction we actually need.
-    // If this is a very short row (smaller than our wavefront size), then we don't need
-    // to do all iterations of the below loop.
-    unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE);
-    // find next highest power of two. So if we have 5 things to reduce, we need to
-    // do a reduction from 8 threads' values. The last 3 will be '0'
-    num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1));
-
-    for (int i = num_items >> 1; i > 0; i >>= 1)
-    {
-        lds[wg_lid] = temp_sum;
-        max_depth[wg_lid] = row_max_depth;
-         asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-        if (lid < i)
-        {
-            temp_sum += lds[wg_lid + i];
-            row_max_depth = max(row_max_depth, max_depth[wg_lid + i]);
-        }
-         asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-    }
-    // at this point, max_depth[thread_0_within_each_wavefront]
-    // contains the useful maximum depth for this row.
-    max_depth[wg_lid] = row_max_depth;
-    // at this point, thread 0's "temp_sum" contains the final useful value.
-    return temp_sum;
-}
-
-// Use a traditional LDS-based reduction to have all of the threads in the wave
-// add their values into OUTPUT_THREAD's variable.
-// It hides the max work behind the same s_waitcnt on local memory,
-// so it should be faster than calling the reduce function three times in a row.
-__device__
-FPTYPE lds_reduction_three(FPTYPE temp_sum, unsigned int row_max_depth,
-        unsigned int spin_times, __shared__ FPTYPE *lds,
-        __shared__ unsigned int *max_depth, __shared__ unsigned int *total_spins,
-        unsigned int start_of_this_row, unsigned int end_of_this_row,
-        unsigned int wg_lid)
-{
-    const unsigned int lid = wg_lid % WF_SIZE;
-
-    // Have all the threads in a workgroup reduce their data into a single
-    // value that's then read by the lead thread
-    // We start by calculating how many layers of reduction we actually need.
-    // If this is a very short row (smaller than our wavefront size), then we don't need
-    // to do all iterations of the below loop.
-    unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE);
-    // find next highest power of two. So if we have 5 things to reduce, we need to
-    // do a reduction from 8 threads' values. The last 3 will be '0'
-    num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1));
-
-    for (int i = num_items >> 1; i > 0; i >>= 1)
-    {
-        lds[wg_lid] = temp_sum;
-        max_depth[wg_lid] = row_max_depth;
-        total_spins[wg_lid] = spin_times;
-         asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-        if (lid < i)
-        {
-            temp_sum += lds[wg_lid + i];
-            row_max_depth = max(row_max_depth, max_depth[wg_lid + i]);
-            spin_times += total_spins[wg_lid + i];
-        }
-         asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-    }
-    // at this point, max_depth[thread_0_within_each_wavefront]
-    // contains the useful maximum depth for this row.
-    max_depth[wg_lid] = row_max_depth;
-    // and total_spins[thread_0_within_each_wavefront] has its
-    // total number of spin-loops.
-    total_spins[wg_lid] = spin_times;
-    // at this point, thread 0's "temp_sum" contains the final useful value.
-    return temp_sum;
-}
-
-// Do a reduction using bpermute instructions.
-// This is strictly worse than Swizzle-based reduction, since it is slower and
-// only works on the same hardware as the swizzle instructions.
-__device__
-FPTYPE bpermute_reduction(FPTYPE temp_sum, unsigned int start_of_this_row,
-        unsigned int end_of_this_row, unsigned int wg_lid)
-{
-    const unsigned int lid = wg_lid % WF_SIZE;
-
-    // Have all the threads in a workgroup reduce their data into a single
-    // value that's then read by the lead thread
-    // We start by calculating how many layers of reduction we actually need.
-    // If this is a very short row (smaller than our workgroup size), then we don't need
-    // to do all iterations of the below loop.
-    unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE);
-    // find next highest power of two. So if we have 5 things to reduce, we need to
-    // do a reduction from 8 threads' values. The last 3 will be '0'
-    num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1));
-
-#ifdef USE_DOUBLE
-    typedef union dbl_b32 {
-        double val;
-        uint2 b32;
-    } dbl_b32_t;
-    dbl_b32_t t_temp_sum;
-    t_temp_sum.val = temp_sum;
-    for (int i = num_items >> 1; i > 0; i >>= 1)
-    {
-        int pull_from = (lid + i) << 2;
-        dbl_b32_t upper_sum;
-        upper_sum.b32.x = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.x);
-        upper_sum.b32.y = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.y);
-        t_temp_sum.val += upper_sum.val;
-    }
-    temp_sum = t_temp_sum.val;
-#else // !USE_DOUBLE
-    for (int i = num_items >> 1; i > 0; i >>= 1)
-    {
-        uint pull_from = (lid + i) << 2;
-        temp_sum += as_float(__builtin_amdgcn_ds_bpermute(pull_from, as_uint(temp_sum)));
-    }
-#endif // USE_DOUBLE
-    return temp_sum;
-}
-
-// Do a reduction using bpermute instructions.
-// This is strictly worse than Swizzle-based reduction, since it is slower and
-// only works on the same hardware as the swizzle instructions.
-// This version also does a max-reduce on the row_max_depth variable.
-// It hides this bpermute instruction behind the same s_waitcnt on local memory,
-// so it should be faster than calling the reduce function twice in a row.
-__device__
-FPTYPE bpermute_reduction_two(FPTYPE temp_sum, unsigned int *row_max_depth,
-        unsigned int start_of_this_row, unsigned int end_of_this_row,
-        unsigned int wg_lid)
-{
-    const unsigned int lid = wg_lid % WF_SIZE;
-    unsigned int max_depth = *row_max_depth;
-
-    // Have all the threads in a workgroup reduce their data into a single
-    // value that's then read by the lead thread
-    // We start by calculating how many layers of reduction we actually need.
-    // If this is a very short row (smaller than our workgroup size), then we don't need
-    // to do all iterations of the below loop.
-    unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE);
-    // find next highest power of two. So if we have 5 things to reduce, we need to
-    // do a reduction from 8 threads' values. The last 3 will be '0'
-    num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1));
-
-#ifdef USE_DOUBLE
-    typedef union dbl_b32 {
-        double val;
-        int2 b32;
-    } dbl_b32_t;
-    dbl_b32_t t_temp_sum;
-    t_temp_sum.val = temp_sum;
-    for (int i = num_items >> 1; i > 0; i >>= 1)
-    {
-        int pull_from = (lid + i) << 2;
-        dbl_b32_t upper_sum;
-        upper_sum.b32.x = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.x);
-        upper_sum.b32.y = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.y);
-        max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_bpermute(pull_from, max_depth)));
-        t_temp_sum.val += upper_sum.val;
-    }
-    temp_sum = t_temp_sum.val;
-#else // !USE_DOUBLE
-    for (int i = num_items >> 1; i > 0; i >>= 1)
-    {
-        int pull_from = (lid + i) << 2;
-        max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_bpermute(pull_from, max_depth)));
-        temp_sum += as_float(__builtin_amdgcn_ds_bpermute(pull_from, as_uint(temp_sum)));
-    }
-#endif // USE_DOUBLE
-    *row_max_depth = max_depth;
-    return temp_sum;
-}
-
-// Do a reduction using bpermute instructions.
-// This is strictly worse than Swizzle-based reduction, since it is slower and
-// only works on the same hardware as the swizzle instructions.
-// This version also does a max-reduce on the row_max_depth variable.
-// This version also does a max-add on the spin-loops per thread variable.
-// It hides this bpermute instruction behind the same s_waitcnt on local memory,
-// so it should be faster than calling the reduce function thrice in a row.
-__device__
-FPTYPE bpermute_reduction_three(FPTYPE temp_sum, unsigned int *row_max_depth,
-        unsigned int *spin_times, unsigned int start_of_this_row,
-        unsigned int end_of_this_row, unsigned int wg_lid)
-{
-    const unsigned int lid = wg_lid % WF_SIZE;
-    unsigned int max_depth = *row_max_depth;
-    unsigned int spin_time = *spin_times;
-
-    // Have all the threads in a workgroup reduce their data into a single
-    // value that's then read by the lead thread
-    // We start by calculating how many layers of reduction we actually need.
-    // If this is a very short row (smaller than our workgroup size), then we don't need
-    // to do all iterations of the below loop.
-    unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE);
-    // find next highest power of two. So if we have 5 things to reduce, we need to
-    // do a reduction from 8 threads' values. The last 3 will be '0'
-    num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1));
-
-#ifdef USE_DOUBLE
-    typedef union dbl_b32 {
-        double val;
-        int2 b32;
-    } dbl_b32_t;
-    dbl_b32_t t_temp_sum;
-    t_temp_sum.val = temp_sum;
-    for (int i = num_items >> 1; i > 0; i >>= 1)
-    {
-        int pull_from = (lid + i) << 2;
-        dbl_b32_t upper_sum;
-        upper_sum.b32.x = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.x);
-        upper_sum.b32.y = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.y);
-        max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_bpermute(pull_from, max_depth)));
-        spin_time += __builtin_amdgcn_ds_bpermute(pull_from, spin_time);
-        t_temp_sum.val += upper_sum.val;
-    }
-    temp_sum = t_temp_sum.val;
-#else // !USE_DOUBLE
-    for (int i = num_items >> 1; i > 0; i >>= 1)
-    {
-        int pull_from = (lid + i) << 2;
-        max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_bpermute(pull_from, max_depth)));
-        spin_time += __builtin_amdgcn_ds_bpermute(pull_from, spin_time);
-        temp_sum += as_float(__builtin_amdgcn_ds_bpermute(pull_from, as_uint(temp_sum)));
-    }
-#endif // USE_DOUBLE
-    *row_max_depth = max_depth;
-    *spin_times = spin_time;
-    return temp_sum;
-}
-
-// Swizzle-based reduction; this will work on Sea Islands
-/*
-FPTYPE swizzle_reduction(FPTYPE temp_sum)
-{
-#ifdef USE_DOUBLE
-    typedef union dbl_b32 {
-        double val;
-        int2 b32;
-    } dbl_b32_t;
-    dbl_b32_t upper_sum, t_temp_sum;
-
-    t_temp_sum.val = temp_sum;
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x80b1);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x80b1);
-    t_temp_sum.val += upper_sum.val;
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x804e);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x804e);
-    t_temp_sum.val += upper_sum.val;
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x101f);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x101f);
-    t_temp_sum.val += upper_sum.val;
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x201f);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x201f);
-    t_temp_sum.val += upper_sum.val;
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x401f);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x401f);
-    t_temp_sum.val += upper_sum.val;
-    upper_sum.b32.x = __builtin_amdgcn_readlane(t_temp_sum.b32.x, 32);
-    upper_sum.b32.y = __builtin_amdgcn_readlane(t_temp_sum.b32.y, 32);
-    t_temp_sum.val += upper_sum.val;
-    temp_sum = t_temp_sum.val;
-#else // Swizzle-based for SPFP
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x80b1));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x804e));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x101f));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x201f));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x401f));
-    temp_sum += as_float(__builtin_amdgcn_readlane(as_uint(temp_sum), 32));
-#endif // Single or double precision
-
-    return temp_sum;
-}
-
-// Swizzle-based reduction; this will work on Sea Islands
-// This version will also put in a max-reduction for row_max_depth behind
-// the s_waitcnt instructions, making it faster than two sequential
-// reductions back-to-back.
-__device__
-FPTYPE swizzle_reduction_two(FPTYPE temp_sum, unsigned int *row_max_depth)
-{
-#ifdef USE_DOUBLE
-    typedef union dbl_b32 {
-        double val;
-        int2 b32;
-    } dbl_b32_t;
-    dbl_b32_t upper_sum, t_temp_sum;
-
-    t_temp_sum.val = temp_sum;
-    unsigned int max_depth = *row_max_depth;
-    unsigned int upper_max_depth;
-
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x80b1)));
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x80b1);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x80b1);
-    t_temp_sum.val += upper_sum.val;
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x804e)));
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x804e);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x804e);
-    t_temp_sum.val += upper_sum.val;
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x101f)));
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x101f);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x101f);
-    t_temp_sum.val += upper_sum.val;
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x201f)));
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x201f);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x201f);
-    t_temp_sum.val += upper_sum.val;
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x401f)));
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x401f);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x401f);
-    t_temp_sum.val += upper_sum.val;
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_readlane(max_depth, 32)));
-    upper_sum.b32.x = __builtin_amdgcn_readlane(t_temp_sum.b32.x, 32);
-    upper_sum.b32.y = __builtin_amdgcn_readlane(t_temp_sum.b32.y, 32);
-    t_temp_sum.val += upper_sum.val;
-    temp_sum = t_temp_sum.val;
-#else // Swizzle-based for SPFP
-    unsigned int max_depth = *row_max_depth;
-
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x80b1));
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x80b1)));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x804e));
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x804e)));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x101f));
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x101f)));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x201f));
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x201f)));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x401f));
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x401f)));
-    temp_sum += as_float(__builtin_amdgcn_readlane(as_uint(temp_sum), 32));
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_readlane(max_depth, 32)));
-#endif // Single or double precision
-
-#ifndef SYNCFREE_KERNEL
-    *row_max_depth = max_depth;
-#endif
-    return temp_sum;
-}
-
-// Swizzle-based reduction; this will work on Sea Islands
-// This version will also put in a max-reduction for row_max_depth
-// add-reduction of the spin-loop counter behind the s_waitcnt
-// instructions, making it faster than two sequential reductions
-// back-to-back.
-__device__
-FPTYPE swizzle_reduction_three(FPTYPE temp_sum, unsigned int *row_max_depth, unsigned int *spin_times)
-{
-    unsigned int max_depth;
-    unsigned int spins;
-
-#ifdef USE_DOUBLE
-    typedef union dbl_b32 {
-        double val;
-        int2 b32;
-    } dbl_b32_t;
-    dbl_b32_t upper_sum, t_temp_sum;
-
-    t_temp_sum.val = temp_sum;
-    max_depth = *row_max_depth;
-    spins = *spin_times;
-
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x80b1)));
-    spins += __builtin_amdgcn_ds_swizzle(spins, 0x80b1);
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x80b1);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x80b1);
-    t_temp_sum.val += upper_sum.val;
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x804e)));
-    spins += __builtin_amdgcn_ds_swizzle(spins, 0x804e);
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x804e);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x804e);
-    t_temp_sum.val += upper_sum.val;
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x101f)));
-    spins += __builtin_amdgcn_ds_swizzle(spins, 0x101f);
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x101f);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x101f);
-    t_temp_sum.val += upper_sum.val;
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x201f)));
-    spins += __builtin_amdgcn_ds_swizzle(spins, 0x201f);
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x201f);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x201f);
-    t_temp_sum.val += upper_sum.val;
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x401f)));
-    spins += __builtin_amdgcn_ds_swizzle(spins, 0x401f);
-    upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x401f);
-    upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x401f);
-    t_temp_sum.val += upper_sum.val;
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_readlane(max_depth, 32)));
-    spins += __builtin_amdgcn_readlane(spins, 32);
-    upper_sum.b32.x = __builtin_amdgcn_readlane(t_temp_sum.b32.x, 32);
-    upper_sum.b32.y = __builtin_amdgcn_readlane(t_temp_sum.b32.y, 32);
-    t_temp_sum.val += upper_sum.val;
-    temp_sum = t_temp_sum.val;
-
-#else // Swizzle-based for SPFP
-    max_depth = *row_max_depth;
-    spins = *spin_times;
-
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x80b1));
-    spins += __builtin_amdgcn_ds_swizzle(spins, 0x80b1);
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x80b1)));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x804e));
-    spins += __builtin_amdgcn_ds_swizzle(spins, 0x804e);
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x804e)));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x101f));
-    spins += __builtin_amdgcn_ds_swizzle(spins, 0x101f);
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x101f)));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x201f));
-    spins += __builtin_amdgcn_ds_swizzle(spins, 0x201f);
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x201f)));
-    temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x401f));
-    spins += __builtin_amdgcn_ds_swizzle(spins, 0x401f);
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x401f)));
-    temp_sum += as_float(__builtin_amdgcn_readlane(as_uint(temp_sum), 32));
-    spins += __builtin_amdgcn_readlane(spins, 32);
-    max_depth = max(max_depth, as_uint(__builtin_amdgcn_readlane(max_depth, 32)));
-#endif // Single or double precision
-
-    *row_max_depth = max_depth;
-    *spin_times = spins;
-    return temp_sum;
-}
-*/
-
-// If we are in GCN3, then use DPP to further increase the performance of
-// inter-lane reduction of the temp_sum variable.
-__device__
-FPTYPE dpp_reduction(FPTYPE temp_sum)
-{
-    // If we write the EXEC mask before the DPP op, we need 5 stall cycles.
-    // So every one of these starts with an s_nop 4
-    // We require an s_nop 1 at the end in case the compiler immediately uses
-    // the last output value.
-#ifndef GCN2
-#ifdef USE_DOUBLE
-
-    typedef struct b32_2 {
-        int x;
-        int y;
-    } b32_t;
-
-    typedef union dbl_b32 {
-        double val;
-        b32_t b32;
-    } dbl_b32_t;
-    dbl_b32_t upper_sum, t_temp_sum;
-    t_temp_sum.val = temp_sum;
-    upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x111, 0xf, 0xf, 0); // row_shr:1
-    upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x111, 0xf, 0xf, 0);
-    t_temp_sum.val += upper_sum.val;
-    upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x112, 0xf, 0xf, 0); // row_shr:2
-    upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x112, 0xf, 0xf, 0);
-    t_temp_sum.val += upper_sum.val;
-    upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x114, 0xf, 0xe, 0); // row_shr:4 bank_mask:0xe
-    upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x114, 0xf, 0xe, 0);
-    t_temp_sum.val += upper_sum.val;
-    upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x118, 0xf, 0xc, 0); // row_shr:8 bank_mask:0xc
-    upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x118, 0xf, 0xc, 0);
-    t_temp_sum.val += upper_sum.val;
-    upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x142, 0xa, 0xf, 0); // row_bcast:15 row_mask:0xa
-    upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x142, 0xa, 0xf, 0);
-    t_temp_sum.val += upper_sum.val;
-    upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x143, 0xc, 0xf, 0); // row_bcast:31 row_maxk:0xc
-    upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x143, 0xc, 0xf, 0);
-    t_temp_sum.val += upper_sum.val;
-    return t_temp_sum.val;
-#else // USE_DOUBLE
-    __asm__ volatile ("s_nop 4\n"
-                      "v_add_f32 %0 %0 %0 row_shr:1 bound_ctrl:0\n"
-                      "s_nop 1\n"
-                      "v_add_f32 %0 %0 %0 row_shr:2 bound_ctrl:0\n"
-                      "s_nop 1\n"
-                      "v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n"
-                      "s_nop 1\n"
-                      "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
-                      "s_nop 1\n"
-                      "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
-                      "s_nop 1\n"
-                      "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
-                      "s_nop 1"
-                      : "=v"(temp_sum)
-                      : "0"(temp_sum));
-    return temp_sum;
-#endif // Single vs. Double
-#else // We're in GCN2, so we will never enter this function
-    return temp_sum;
-#endif
-}
-
-// This version of the DPP reduction function also does a max-reduce on the
-// row_max_depth variable. It fits these DPP functions into one of the NOP
-// slots required by the DPP instructions, so it should be fast.
-__device__
-FPTYPE dpp_reduction_two(FPTYPE temp_sum, unsigned int *row_max_depth)
-{
-    // Reduces two values across the wavefront with DPP cross-lane modifiers
-    // (row_shr:1/2/4/8 followed by row_bcast:15/31) issued from inline
-    // assembly: temp_sum is add-reduced and *row_max_depth is max-reduced
-    // (v_max_u32). The reduced maximum is written back through row_max_depth
-    // and the reduced sum is returned.
-    // NOTE(review): presumably only the lane named by OUTPUT_THREAD
-    // (WF_SIZE-1 when DPP_REDUCTION is defined) holds the full result --
-    // confirm against the callers.
-    //
-    // If we write the EXEC mask before the DPP op, we need 5 stall cycles.
-    // So every one of these starts with an s_nop 4
-    // We require an s_nop 1 at the end in case the compiler immediately uses
-    // the last output value.
-    unsigned int temp_max;
-#ifdef USE_DOUBLE
-    // View of a double as two 32-bit halves, so the assembly can move each
-    // half with v_mov_b32 and still add the whole value with v_add_f64.
-    typedef struct b32_2 {
-        int x;
-        int y;
-    } b32_t;
-
-    typedef union dbl_b32 {
-        double val;
-        b32_t b32;
-    } dbl_b32_t;
-    dbl_b32_t upper_sum, t_temp_sum;
-    temp_max = *row_max_depth;
-    t_temp_sum.val = temp_sum;
-    // Operand map: %0/%1 = halves of upper_sum (pinned to v2/v3),
-    // %2 = temp_max (tied to input %6), %3 = running sum (tied to input %7),
-    // %4/%5 = halves of the running sum, %8 = upper_sum read as the v[2:3]
-    // pair. Each step DPP-moves both halves into upper_sum, max-reduces
-    // temp_max, then adds upper_sum into the running sum.
-    // NOTE(review): the row_bcast:15 steps use bank_mask:0xa and the
-    // row_bcast:31 steps mix row_mask:0xc with bank_mask:0xc, whereas the
-    // single-precision path below uses row_mask for both broadcasts. This
-    // sequence also ends without the trailing "s_nop 1" the comment above asks
-    // for. Confirm both are intentional.
-    __asm__ volatile ("s_nop 4\n"
-                      "v_mov_b32 %0 %4 row_shr:1 bound_ctrl:0\n"
-                      "v_mov_b32 %1 %5 row_shr:1 bound_ctrl:0\n"
-                      "v_max_u32 %2 %2 %2 row_shr:1 bound_ctrl:0\n"
-                      "s_nop 0\n"
-                      "v_add_f64 %3 %7 %8\n"
-                      "v_mov_b32 %0 %4 row_shr:2 bound_ctrl:0\n"
-                      "v_mov_b32 %1 %5 row_shr:2 bound_ctrl:0\n"
-                      "v_max_u32 %2 %2 %2 row_shr:2 bound_ctrl:0\n"
-                      "s_nop 0\n"
-                      "v_add_f64 %3 %7 %8\n"
-                      "v_mov_b32 %0 %4 row_shr:4 bank_mask:0xe\n"
-                      "v_mov_b32 %1 %5 row_shr:4 bank_mask:0xe\n"
-                      "v_max_u32 %2 %2 %2 row_shr:4 bank_mask:0xe\n"
-                      "s_nop 0\n"
-                      "v_add_f64 %3 %7 %8\n"
-                      "v_mov_b32 %0 %4 row_shr:8 bank_mask:0xc\n"
-                      "v_mov_b32 %1 %5 row_shr:8 bank_mask:0xc\n"
-                      "v_max_u32 %2 %2 %2 row_shr:8 bank_mask:0xc\n"
-                      "s_nop 0\n"
-                      "v_add_f64 %3 %7 %8\n"
-                      "v_mov_b32 %0 %4 row_bcast:15 bank_mask:0xa\n"
-                      "v_mov_b32 %1 %5 row_bcast:15 bank_mask:0xa\n"
-                      "v_max_u32 %2 %2 %2 row_bcast:15 bank_mask:0xa\n"
-                      "s_nop 0\n"
-                      "v_add_f64 %3 %7 %8\n"
-                      "v_mov_b32 %0 %4 row_bcast:31 row_mask:0xc\n"
-                      "v_mov_b32 %1 %5 row_bcast:31 bank_mask:0xc\n"
-                      "v_max_u32 %2 %2 %2 row_bcast:31 bank_mask:0xc\n"
-                      "s_nop 0\n"
-                      "v_add_f64 %3 %7 %8\n"
-                      : "={v2}"(upper_sum.b32.x), "={v3}"(upper_sum.b32.y), "=v"(temp_max),  "=v"(t_temp_sum.val)
-                      :  "v"(t_temp_sum.b32.x), "v"(t_temp_sum.b32.y), "2"(temp_max), "3"(t_temp_sum.val), "{v[2:3]}"(upper_sum.val));
-    *row_max_depth = temp_max;
-    return t_temp_sum.val;
-#else
-    temp_max = *row_max_depth;
-    // Single precision: the sum (%0) and the maximum (%1) are each reduced in
-    // place, one DPP instruction per step; both outputs are tied to their
-    // inputs through the "0"/"1" constraints.
-    __asm__ volatile ("s_nop 4\n"
-                      "v_add_f32 %0 %0 %0 row_shr:1 bound_ctrl:0\n"
-                      "v_max_u32 %1 %1 %1 row_shr:1 bound_ctrl:0\n"
-                      "s_nop 0\n"
-                      "v_add_f32 %0 %0 %0 row_shr:2 bound_ctrl:0\n"
-                      "v_max_u32 %1 %1 %1 row_shr:2 bound_ctrl:0\n"
-                      "s_nop 0\n"
-                      "v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n"
-                      "v_max_u32 %1 %1 %1 row_shr:4 bank_mask:0xe\n"
-                      "s_nop 0\n"
-                      "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
-                      "v_max_u32 %1 %1 %1 row_shr:8 bank_mask:0xc\n"
-                      "s_nop 0\n"
-                      "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
-                      "v_max_u32 %1 %1 %1 row_bcast:15 row_mask:0xa\n"
-                      "s_nop 0\n"
-                      "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
-                      "v_max_u32 %1 %1 %1 row_bcast:31 row_mask:0xc\n"
-                      "s_nop 1\n"
-                      : "=v"(temp_sum), "=v"(temp_max)
-                      : "0"(temp_sum), "1"(temp_max));
-    *row_max_depth = temp_max;
-    return temp_sum;
-#endif // Single vs. Double
-}
-
-// This version of the DPP reduction function also does a max-reduce on the
-// row_max_depth variable and max-add on the total spin variable.
-// It fits these DPP functions into NOP slots required by the DPP
-// instructions, so it should be fast.
-__device__
-FPTYPE dpp_reduction_three(FPTYPE temp_sum, unsigned int *row_max_depth, unsigned int *spin_times)
-{
-    // Reduces three values across the wavefront with DPP cross-lane modifiers
-    // (row_shr:1/2/4/8 followed by row_bcast:15/31) issued from inline
-    // assembly: temp_sum is add-reduced, *row_max_depth is max-reduced
-    // (v_max_u32) and *spin_times is add-reduced (v_add_u32). The two integer
-    // results are written back through their pointers and the reduced sum is
-    // returned.
-    // NOTE(review): presumably only the lane named by OUTPUT_THREAD
-    // (WF_SIZE-1 when DPP_REDUCTION is defined) holds the full result --
-    // confirm against the callers.
-    //
-    // If we write the EXEC mask before the DPP op, we need 5 stall cycles.
-    // So every one of these starts with an s_nop 4
-    // We require an s_nop 1 at the end in case the compiler immediately uses
-    // the last output value.
-    unsigned int temp_max = *row_max_depth;
-    unsigned int temp_spin = *spin_times;
-#ifdef USE_DOUBLE
-    // View of a double as two 32-bit halves, so the assembly can move each
-    // half with v_mov_b32 and still add the whole value with v_add_f64.
-    typedef struct b32_2 {
-        int x;
-        int y;
-    } b32_t;
-
-    typedef union dbl_b32 {
-        double val;
-        b32_t b32;
-    } dbl_b32_t;
-    dbl_b32_t upper_sum, t_temp_sum;
-    // Redundant with the initialiser above; kept as in the original.
-    temp_max = *row_max_depth;
-    t_temp_sum.val = temp_sum;
-    // Operand map: %0/%1 = halves of upper_sum (pinned to v2/v3),
-    // %2 = temp_max (tied to %7), %3 = temp_spin (tied to %8),
-    // %4 = running sum (tied to %9), %5/%6 = halves of the running sum,
-    // %10 = upper_sum read as the v[2:3] pair. VCC is a macro defined outside
-    // this block that is spliced into the v_add_u32 operand list.
-    __asm__ volatile ("s_nop 4\n"
-                      "v_mov_b32 %0 %5 row_shr:1 bound_ctrl:0\n"
-                      "v_mov_b32 %1 %6 row_shr:1 bound_ctrl:0\n"
-                      "v_max_u32 %2 %2 %2 row_shr:1 bound_ctrl:0\n"
-                      "v_add_u32 %3 " VCC " %3 %3 row_shr:1 bound_ctrl:0\n"
-                      "v_add_f64 %4 %9 %10\n"
-                      "v_mov_b32 %0 %5 row_shr:2 bound_ctrl:0\n"
-                      "v_mov_b32 %1 %6 row_shr:2 bound_ctrl:0\n"
-                      "v_max_u32 %2 %2 %2 row_shr:2 bound_ctrl:0\n"
-                      "v_add_u32 %3 " VCC " %3 %3 row_shr:2 bound_ctrl:0\n"
-                      "v_add_f64 %4 %9 %10\n"
-                      "v_mov_b32 %0 %5 row_shr:4 bank_mask:0xe\n"
-                      "v_mov_b32 %1 %6 row_shr:4 bank_mask:0xe\n"
-                      "v_max_u32 %2 %2 %2 row_shr:4 bank_mask:0xe\n"
-                      "v_add_u32 %3 " VCC " %3 %3 row_shr:4 bank_mask:0xe\n"
-                      "v_add_f64 %4 %9 %10\n"
-                      "v_mov_b32 %0 %5 row_shr:8 bank_mask:0xc\n"
-                      "v_mov_b32 %1 %6 row_shr:8 bank_mask:0xc\n"
-                      "v_max_u32 %2 %2 %2 row_shr:8 bank_mask:0xc\n"
-                      "v_add_u32 %3 " VCC " %3 %3 row_shr:8 bank_mask:0xc\n"
-                      "v_add_f64 %4 %9 %10\n"
-                      "v_mov_b32 %0 %5 row_bcast:15 row_mask:0xa\n"
-                      "v_mov_b32 %1 %6 row_bcast:15 row_mask:0xa\n"
-                      "v_max_u32 %2 %2 %2 row_bcast:15 row_mask:0xa\n"
-                      "v_add_u32 %3 " VCC " %3 %3 row_bcast:15 row_mask:0xa\n"
-                      "v_add_f64 %4 %9 %10\n"
-                      "v_mov_b32 %0 %5 row_bcast:31 row_mask:0xc\n"
-                      "v_mov_b32 %1 %6 row_bcast:31 row_mask:0xc\n"
-                      "v_max_u32 %2 %2 %2 row_bcast:31 row_mask:0xc\n"
-                      "v_add_u32 %3 " VCC " %3 %3 row_bcast:31 row_mask:0xc\n"
-                      "v_add_f64 %4 %9 %10\n"
-                      "s_nop 0\n"
-                      : "={v2}"(upper_sum.b32.x), "={v3}"(upper_sum.b32.y), "=v"(temp_max), "=v"(temp_spin), "=v"(t_temp_sum.val)
-                      : "v"(t_temp_sum.b32.x), "v"(t_temp_sum.b32.y), "2"(temp_max), "3"(temp_spin), "4"(t_temp_sum.val), "{v[2:3]}"(upper_sum.val));
-    *row_max_depth = temp_max;
-    *spin_times = temp_spin;
-    return t_temp_sum.val;
-#else
-    // Single precision: sum (%0), maximum (%1) and spin count (%2) are each
-    // reduced in place; every output is tied to its input.
-    // NOTE(review): the two v_add_u32 broadcast steps below carry no row_mask,
-    // unlike the v_add_f32/v_max_u32 instructions next to them and unlike the
-    // double-precision path above -- confirm this is intentional.
-    __asm__ volatile ("s_nop 4\n"
-                      "v_add_f32 %0 %0 %0 row_shr:1 bound_ctrl:0\n"
-                      "v_max_u32 %1 %1 %1 row_shr:1 bound_ctrl:0\n"
-                      "v_add_u32 %2 " VCC " %2 %2 row_shr:1 bound_ctrl:0\n"
-                      "v_add_f32 %0 %0 %0 row_shr:2 bound_ctrl:0\n"
-                      "v_max_u32 %1 %1 %1 row_shr:2 bound_ctrl:0\n"
-                      "v_add_u32 %2 " VCC " %2 %2 row_shr:2 bound_ctrl:0\n"
-                      "v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n"
-                      "v_max_u32 %1 %1 %1 row_shr:4 bank_mask:0xe\n"
-                      "v_add_u32 %2 " VCC " %2 %2 row_shr:4 bank_mask:0xe\n"
-                      "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
-                      "v_max_u32 %1 %1 %1 row_shr:8 bank_mask:0xc\n"
-                      "v_add_u32 %2 " VCC " %2 %2 row_shr:8 bank_mask:0xc\n"
-                      "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
-                      "v_max_u32 %1 %1 %1 row_bcast:15 row_mask:0xa\n"
-                      "v_add_u32 %2 " VCC " %2 %2 row_bcast:15\n"
-                      "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
-                      "v_max_u32 %1 %1 %1 row_bcast:31 row_mask:0xc\n"
-                      "v_add_u32 %2 " VCC " %2 %2 row_bcast:31\n"
-                      "s_nop 1"
-                      : "=v"(temp_sum), "=v"(temp_max), "=v"(temp_spin)
-                      : "0"(temp_sum), "1"(temp_max), "2"(temp_spin));
-    *row_max_depth = temp_max;
-    *spin_times = temp_spin;
-    return temp_sum;
-#endif // Single vs. Double
-}
-
-// Possible reduction techniques:
-//#define LDS_REDUCTION
-//#define BPERMUTE_REDUCTION
-//#define SWIZZLE_REDUCTION
-
-//#define DPP_REDUCTION
-
-// When both GCN2 and DPP_REDUCTION are requested, fall back to the swizzle
-// reduction and drop the DPP request.
-// NOTE(review): presumably because GCN2 targets lack the DPP modifiers --
-// confirm.
-#if defined(GCN2) && defined(DPP_REDUCTION)
-#define SWIZZLE_REDUCTION
-#undef DPP_REDUCTION
-#endif
-
-// Lane of each wavefront that seeds the row sum and writes the row's result:
-// the last lane for the DPP reduction, lane 0 for every other technique.
-// The expansion is parenthesised so it stays a single operand wherever the
-// macro is used (e.g. next to '*' or '%').
-#ifdef DPP_REDUCTION
-    #define OUTPUT_THREAD (WF_SIZE-1)
-#else
-    #define OUTPUT_THREAD 0
-#endif
-
-// Reduces temp_sum across the lanes of a wavefront with the technique chosen
-// at compile time (LDS_REDUCTION, BPERMUTE_REDUCTION, SWIZZLE_REDUCTION or
-// DPP_REDUCTION; the first one defined, in that order, wins).
-//
-// temp_sum           this lane's partial sum
-// lds_ptr            LDS scratch buffer, used only by the LDS technique
-// start_of_this_row,
-// end_of_this_row    bounds forwarded to the LDS and bpermute techniques
-// wg_lid             thread index within the workgroup
-//
-// Returns the value produced by the selected technique. If no technique is
-// defined, temp_sum is returned unchanged, matching
-// cross_lane_reduction_three; previously control fell off the end of this
-// non-void function, which is undefined behaviour.
-__device__
-inline FPTYPE cross_lane_reduction(FPTYPE temp_sum, __shared__ FPTYPE *lds_ptr,
-        unsigned int start_of_this_row, unsigned int end_of_this_row,
-        unsigned int wg_lid)
-{
-#if defined(LDS_REDUCTION)
-    FPTYPE temp_val = lds_reduction(temp_sum, lds_ptr, start_of_this_row,
-            end_of_this_row, wg_lid);
-    return temp_val;
-#elif defined(BPERMUTE_REDUCTION)
-    return bpermute_reduction(temp_sum, start_of_this_row, end_of_this_row,
-            wg_lid);
-#elif defined(SWIZZLE_REDUCTION)
-    return swizzle_reduction(temp_sum);
-#elif defined(DPP_REDUCTION)
-    return dpp_reduction(temp_sum);
-#else
-    // No reduction technique selected: hand the value back untouched.
-    return temp_sum;
-#endif
-}
-
-// Reduces temp_sum (add) and *row_max_depth (max) across the lanes of a
-// wavefront with the technique chosen at compile time (LDS_REDUCTION,
-// BPERMUTE_REDUCTION, SWIZZLE_REDUCTION or DPP_REDUCTION; the first one
-// defined, in that order, wins).
-//
-// temp_sum           this lane's partial sum
-// row_max_depth      in/out: this lane's depth, replaced by the reduced value
-// lds_ptr,
-// max_depth_ptr      LDS scratch buffers, used only by the LDS technique
-// start_of_this_row,
-// end_of_this_row    bounds forwarded to the LDS and bpermute techniques
-// wg_lid             thread index within the workgroup
-//
-// Returns the value produced by the selected technique. If no technique is
-// defined, temp_sum is returned and *row_max_depth is left unchanged,
-// matching cross_lane_reduction_three; previously control fell off the end
-// of this non-void function, which is undefined behaviour.
-__device__
-inline FPTYPE cross_lane_reduction_two(FPTYPE temp_sum, unsigned int *row_max_depth,
-        __shared__ FPTYPE *lds_ptr, __shared__ unsigned int *max_depth_ptr,
-        unsigned int start_of_this_row, unsigned int end_of_this_row,
-        unsigned int wg_lid)
-{
-#if defined(LDS_REDUCTION)
-    FPTYPE temp_val = lds_reduction_two(temp_sum, *row_max_depth, lds_ptr,
-            max_depth_ptr, start_of_this_row, end_of_this_row, wg_lid);
-    // The reduced depth is read from the first LDS slot of this wavefront.
-    *row_max_depth = max_depth_ptr[wg_lid & (~(WF_SIZE-1))];
-    return temp_val;
-#elif defined(BPERMUTE_REDUCTION)
-    return bpermute_reduction_two(temp_sum, row_max_depth, start_of_this_row,
-            end_of_this_row, wg_lid);
-#elif defined(SWIZZLE_REDUCTION)
-    return swizzle_reduction_two(temp_sum, row_max_depth);
-#elif defined(DPP_REDUCTION)
-    return dpp_reduction_two(temp_sum, row_max_depth);
-#else
-    // No reduction technique selected: hand the value back untouched.
-    return temp_sum;
-#endif
-}
-
-// Wavefront-wide reduction of three quantities: temp_sum (add),
-// *row_max_depth (max) and *spin_times (add). The technique is picked at
-// compile time; when several selection macros are defined the first of
-// LDS_REDUCTION, BPERMUTE_REDUCTION, SWIZZLE_REDUCTION, DPP_REDUCTION is used,
-// and when none is defined temp_sum is returned as-is with both pointers
-// left untouched.
-__device__
-inline FPTYPE cross_lane_reduction_three(FPTYPE temp_sum, unsigned int *row_max_depth,
-        unsigned int *spin_times, __shared__ FPTYPE *lds_ptr,
-        __shared__ unsigned int *max_depth_ptr, __shared__ unsigned int *total_spins_ptr,
-        unsigned int start_of_this_row, unsigned int end_of_this_row,
-        unsigned int wg_lid)
-{
-#if defined(LDS_REDUCTION)
-    FPTYPE reduced = lds_reduction_three(temp_sum, *row_max_depth, *spin_times,
-            lds_ptr, max_depth_ptr, total_spins_ptr, start_of_this_row,
-            end_of_this_row, wg_lid);
-    // Index of the first LDS slot belonging to this wavefront.
-    const unsigned int wave_base = wg_lid & (~(WF_SIZE-1));
-    *row_max_depth = max_depth_ptr[wave_base];
-    *spin_times = total_spins_ptr[wave_base];
-    return reduced;
-#elif defined(BPERMUTE_REDUCTION)
-    return bpermute_reduction_three(temp_sum, row_max_depth, spin_times,
-            start_of_this_row, end_of_this_row, wg_lid);
-#elif defined(SWIZZLE_REDUCTION)
-    return swizzle_reduction_three(temp_sum, row_max_depth, spin_times);
-#elif defined(DPP_REDUCTION)
-    return dpp_reduction_three(temp_sum, row_max_depth, spin_times);
-#else
-    return temp_sum;
-#endif
-}
-
-// The option below will, in the analyze and syncfree kernels, attempt to
-// spin-loop on flags in the LDS for rows that are being solved by wavefronts
-// earlier in the same workgroup. This should relieve global memory pressure.
-// We found that, with careful control of branching for this logic, this yields
-// an average of 20% better performance than global spin-looping.
-#define USE_LDS_SPINLOOP
-
-// The option below will, in the levelsync kernel, attempt to spin-loop on
-// flags in the LDS for rows that are being solved for wavefronts earlier in
-// the same workgroup. This is beneficial if levels have very few rows in them,
-// as workgroups are likely to have multiple levels and thus require spinning.
-// However, knowing what rows are in the LDS entry is more difficult for the
-// levelsync kernel, because it depends entirely on the rowMap entries being
-// used by these waves. As such, this loses performance when walking the row
-// map outweighs the spin-loop benefits. As of this writing, the levelsync
-// LDS spin-loop is a net loser.
-// Leaving this around for future studies.
-// #define USE_LDS_SPINLOOP_LEVELSYNC
-
-
-// Solves for 'y' in the equation 'A * y = alpha * x'
-// In this kernel, we do not know what level each row is in. As such, we must
-// dynamically figure this out. Each row has the potential to require data from
-// a previous row. This happens when it has a non-zero in a column.
-// i.e. having a non-zero value in column $foo means you must wait for row $foo
-// to finish.
-//
-// The 'doneArray' has one entry per row. It starts out with each entry containing
-// zeroes. When a row finishes and its output written, it knows its own level
-// (which must be 1 more than the highest level of any row it relied on). As such,
-// it puts that level into the doneArray. If you must wait on a previous row, you
-// spinloop on that row's doneArray entry. Once it's non-zero, you know both that
-// the data is ready, as well as what level that value came from (so you can
-// calculate your own level).
-//
-// The doneArray can be used for future iterations, since the parallelism doesn't
-// change between iterations. As such, we keep the doneArray around and call
-// a different kernel that doesn't do the spin-loop waiting. To prep for that
-// kernel, we also need to know how many rows are at each level. Thus, when a
-// row finishes, it increments the numRowsAtLevel[] entry associated with its
-// level. Also we set the maxDepth variable to the maximum of any level seen.
-//__attribute__((reqd_work_group_size(WF_SIZE*WF_PER_WG, 1, 1)))
-//__kernel void
-__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1)
-amd_spts_analyze_and_solve(
-                            const size_t global_work_size,
-#ifdef USE_ROCSHMEM
-                            const int this_pe,
-                            const int total_pes,
-                            unsigned int * __restrict__ shadowDoneArray,
-                            unsigned int * __restrict__ reqUpdateArray,
-                            unsigned int * __restrict__ remoteInProgressArray,
-                            unsigned int * __restrict__ oneBuf,
-                            // Communication scheme:
-                            // 0: Naive puts
-                            // 1: Naive gets
-                            // 2: blocked puts
-                            // 3: put/get hybrid
-                            int rocshmem_algorithm,
-                            int rocshmem_put_block_size,
-                            int rocshmem_get_backoff_factor,
-                            int spts_block_size,
-#endif
-                            const FPTYPE * __restrict__ vals,
-                            const int * __restrict__ cols,
-                            const int * __restrict__ rowPtrs,
-                            const FPTYPE * __restrict__ vec_x,
-                            FPTYPE * __restrict__ out_y,
-                            const FPTYPE alpha,
-                            unsigned int * __restrict__ doneArray,
-                            unsigned int * __restrict__ numRowsAtLevel,
-                            unsigned int * __restrict__ maxDepth,
-                            unsigned long long * __restrict__ totalSpin)
-{
-    // One wavefront solves one row. Each lane walks the row's non-zeros with
-    // stride WF_SIZE, waits for the row named by each column index to be
-    // published in doneArray, subtracts val * y[col] from its partial sum, and
-    // the partial sums are then reduced across the wavefront. The lane named
-    // by OUTPUT_THREAD divides by the diagonal, publishes y[row] and the row's
-    // level, and updates the level statistics.
-
-    // Scratch pointers handed to the cross-lane reduction; they stay null
-    // unless the LDS reduction is compiled in.
-    __shared__ FPTYPE *lds_ptr;
-    lds_ptr = nullptr;
-    __shared__ unsigned int *max_depth_ptr;
-    max_depth_ptr = nullptr;
-    __shared__ unsigned int *total_spins_ptr;
-    total_spins_ptr = nullptr;
-#ifdef LDS_REDUCTION
-    __shared__ FPTYPE lds[WF_SIZE*WF_PER_WG];
-    lds_ptr = lds;
-#endif
-
-    // If we want future kernel iterations to skip the "wait on previous rows"
-    // work, we need to know what level set this row is in. This array is used
-    // to calculate the depth of each dependency so we can calculate max+1.
-#ifdef LDS_REDUCTION
-    __shared__ unsigned int max_depth[WF_SIZE*WF_PER_WG];
-    max_depth_ptr = max_depth;
-    __shared__ unsigned int total_spins[WF_SIZE*WF_PER_WG];
-    total_spins_ptr = total_spins;
-#endif // LDS_REDUCTION
-    unsigned int row_max_depth = 0;
-    unsigned int spin_times = 0;
-    const unsigned int wg_lid = hipThreadIdx_x;
-    const unsigned int lid = wg_lid % WF_SIZE;
-
-#ifdef USE_ROCSHMEM
-    __shared__ rocshmem_ctx_t ctx;
-
-    // Workgroup-collective setup; every thread of the workgroup takes part.
-    rocshmem_wg_init();
-    rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx);
-    __syncthreads();
-#endif
-
-    // Which wavefront within this workgroup
-    // also means which row within this workgroup's group of rows
-    const unsigned int local_offset = wg_lid / WF_SIZE;
-    // First row within this workgroup (within this group of rows)
-    const unsigned int local_first_row = hipBlockIdx_x * WF_PER_WG;
-    // Actual row this wavefront will work on.
-    const unsigned int local_row = local_first_row + local_offset;
-
-#ifdef USE_ROCSHMEM
-    // Get the global row for this wavefront assuming a row-cyclic
-    // decomposition.  Basically we need to account for other PEs here.
-    int local_block_id = local_row / spts_block_size;
-    const unsigned int block_offset = (local_block_id * spts_block_size * total_pes) +
-        (this_pe * spts_block_size);
-    const unsigned int row = block_offset + (local_row % spts_block_size);
-    const unsigned int first_row = block_offset + (local_first_row % spts_block_size);
-#else
-    const unsigned int row = local_row;
-    const unsigned int first_row = local_first_row;
-#endif
-
-    __shared__ FPTYPE diagonal[WF_PER_WG];
-
-#ifdef USE_LDS_SPINLOOP
-    // If we are trying to access an output that was produced by a wavefront
-    // earlier in this workgroup, perform the transfer and spin-loop in LDS
-    // to relieve global memory pressure.
-    __shared__ unsigned int localDoneArray[WF_PER_WG];
-    __shared__ FPTYPE localOutY[WF_PER_WG];
-
-    // Clear this wavefront's slot BEFORE the barrier. Other wavefronts of the
-    // workgroup spin on these flags, so a slot must be zero before any of them
-    // can read it; clearing it after the barrier would let a later wavefront
-    // observe leftover LDS contents as a "done" flag. The clear is done
-    // outside the bounds check below so that the barrier is reached by every
-    // thread of the workgroup.
-    if (lid == 0)
-    {
-        localDoneArray[local_offset] = 0;
-        localOutY[local_offset] = 0.;
-    }
-    __syncthreads();
-#endif
-
-    if (global_work_size > (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)) {
-
-    FPTYPE temp_sum = 0.;
-    // Preload the first thread with alpha * x. We can bring this forward
-    // because the 'x' vector in A*y=alpha*x is fixed and known already.
-    // From this point on, we will subtract out values from rows of X from
-    // alpha*x, and that will allow us to solve for entries of y.
-    // Hauling this up to the top of the kernel increases performance because
-    // it removes the memory load and multiply from the critical path of
-    // "previous rows' inputs are ready, finish this and allow further rows
-    // to start up as fast as possible."
-    if (lid == OUTPUT_THREAD)
-        temp_sum = alpha * vec_x[row];
-
-    unsigned int start_of_this_row = rowPtrs[row];
-    unsigned int end_of_this_row = rowPtrs[row+1];
-    unsigned int start_point = start_of_this_row+lid;
-
-    // This wavefront operates on a single row, from its beginning to end.
-    for (unsigned int j = start_point; j < end_of_this_row; j += WF_SIZE)
-    {
-        // Zero-initialised on purpose: for the diagonal entry neither the LDS
-        // spin-loop nor the global load below assigns out_val when
-        // USE_LDS_SPINLOOP is on, and multiplying 0 by an indeterminate value
-        // could otherwise put a NaN into temp_sum.
-        FPTYPE out_val = 0.;
-        unsigned int local_done = 0;
-        // The two loads below are non-temporal: vals and cols are basically
-        // streamed in, so caching them doesn't help much and only evicts
-        // data other instructions could hit on.
-
-        // local_col will tell us, for this iteration of the above for loop
-        // (i.e. for this entry in this row), which columns contain the
-        // non-zero values. We must then ensure that the output from the row
-        // associated with the local_col is complete to ensure that we can
-        // calculate the right answer.
-        int local_col = __builtin_nontemporal_load(&cols[j]);
-        // Haul loading from vals[] up near the load of cols[] so that we get
-        // good coalesced loads.
-        FPTYPE local_val = __builtin_nontemporal_load(&vals[j]);
-
-        // diagonal. Skip this, we need to solve for it.
-        if (local_col == row)
-        {
-            local_done = 1;
-            diagonal[local_offset] = local_val;
-            local_val = 0.; // Make the out_val multiply below do nothing.
-        }
-
-        // While there are threads in this workgroup that have been unable to
-        // get their input, loop and wait for the flag to exist.
-        __asm__ volatile ("s_setprio 0");
-#ifdef USE_ROCSHMEM
-        // PE that owns the row this entry depends on.
-        int target_pe = (local_col / spts_block_size) % total_pes;
-        int backoff_counter = 0;
-        bool need_remote_notify = true;
-        bool need_comm = true;
-        bool first_time = true;
-#endif
-
-#ifdef USE_LDS_SPINLOOP
-        if (local_col >= first_row)
-        {
-            while (!local_done)
-            {
-                // Check in the LDS if the value was produced by someone
-                // within this workgroup.
-                local_done = localDoneArray[local_col - first_row];
-                out_val = localOutY[local_col - first_row];
-                asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-            }
-        }
-#endif // USE_LDS_SPINLOOP
-        while (!local_done)
-        {
-            // Replace this atomic with an assembly load with GLC bit set.
-            // This forces the load to go to the coherence point, allowing
-            // us to avoid deadlocks.
-            // local_done = atomic_get_done(doneArray, local_col);
-            __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n"
-                "s_waitcnt vmcnt(0)"
-                : "=v"(local_done)
-                : "v"(&doneArray[local_col]));
-
-            spin_times++;
-
-#ifdef USE_ROCSHMEM
-            // Algorithm 1 (naive gets): poll the owning PE for the flag and,
-            // once it is set, fetch the value.
-            if ((total_pes > 1) && (target_pe != this_pe) && (rocshmem_algorithm == 1)) {
-                if (first_time) {
-                    // Only the first thread to claim this column communicates.
-                    if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) != 0)
-                        need_comm = false;
-                }
-                first_time = false;
-                if (need_comm)
-                {
-                    // Back off longer after every unsuccessful poll.
-                    for (int i = 0; i < (backoff_counter * rocshmem_get_backoff_factor); i++)
-                        __asm__ volatile("s_sleep 127");
-
-                    rocshmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe);
-
-                    __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n"
-                        "s_waitcnt vmcnt(0)"
-                        : "=v"(local_done)
-                        : "v"(&shadowDoneArray[local_col]));
-
-                    if (local_done)
-                    {
-                        rocshmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe);
-
-                        // Publish the fetched flag in the local doneArray.
-                        __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP
-                            :
-                            : "v"(&doneArray[local_col]),
-                              "v"(local_done));
-                    } else {
-                        backoff_counter++;
-                    }
-                }
-            }
-
-            // Algorithm 3 (put/get hybrid): register interest in the row with
-            // its owner once, then try a single get of the flag.
-            if ((total_pes > 1) && (target_pe != this_pe) && (rocshmem_algorithm == 3)) {
-                if (need_remote_notify) {
-                    need_remote_notify = false;
-                    {
-                        rocshmem_ctx_putmem_nbi(ctx, &reqUpdateArray[local_col], oneBuf, sizeof(int), target_pe);
-                        rocshmem_ctx_fence(ctx);
-                        rocshmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe);
-                        rocshmem_ctx_quiet(ctx);
-
-                        __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n"
-                            "s_waitcnt vmcnt(0)"
-                            : "=v"(local_done)
-                            : "v"(&shadowDoneArray[local_col]));
-
-                        if (local_done)
-                        {
-                            rocshmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe);
-                            rocshmem_ctx_quiet(ctx);
-                            __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP
-                                    :
-                                    : "v"(&doneArray[local_col]),
-                                    "v"(local_done));
-                        }
-                    }
-                }
-            }
-#endif
-        }
-
-        __asm__ volatile ("s_setprio 1");
-#ifdef USE_LDS_SPINLOOP
-        // Rows at or after first_row were already read from LDS above.
-        if (local_col < first_row)
-#endif
-        {
-            // The command below is manually replaced with GCN assembly with
-            // the GLC bit set. This bypasses the L1, allowing us to do a
-            // coherent load of the variable without needing atomics.
-#ifdef USE_DOUBLE
-            // out_val = as_double(atom_or((__global ulong *)&(out_y[local_col]), 0));
-            __asm__ volatile (MEM_PREFIX"_load_dwordx2 %0 %1 " OFF_MODIFIER " glc\n"
-                "s_waitcnt vmcnt(0)"
-                : "=v"(out_val)
-                : "v"(&out_y[local_col]));
-#else
-            // out_val = as_float(atomic_or((__global uint *)&(out_y[local_col]), 0));
-            __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc\n"
-                "s_waitcnt vmcnt(0)"
-                : "=v"(out_val)
-                : "v"(&out_y[local_col]));
-#endif
-        }
-        temp_sum -= local_val * out_val;
-
-        row_max_depth = max(local_done, row_max_depth);
-    }
-    __asm__ volatile ("s_setprio 1");
-
-    // And if we care about the maximum depth, add it into OUTPUT_THREAD's
-    // entry within the max_depth array.
-    temp_sum = cross_lane_reduction_three(temp_sum, &row_max_depth, &spin_times,
-            lds_ptr, max_depth_ptr, total_spins_ptr, start_of_this_row,
-            end_of_this_row, wg_lid);
-    // This row's level is one more than the deepest row it depended on.
-    row_max_depth++;
-
-    // y = (x-sum_of_vals_from_A) / diag
-    if (lid == OUTPUT_THREAD)
-    {
-#ifndef LDS_REDUCTION
-        // Wait for local memory to quiesce for the diagonal
-        // LDS_REDUCTION has such waits in it already.
-        asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-#endif
-        FPTYPE out_val = temp_sum / diagonal[local_offset];
-
-        // out_y[row] = out_val, written with the GLC bit set.
-#ifdef USE_DOUBLE
-        __asm__ volatile (MEM_PREFIX"_store_dwordx2 %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val));
-#else
-        __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val));
-#endif
-
-#ifdef USE_LDS_SPINLOOP
-        // Value first, flag second: readers spin on the flag.
-        localOutY[row - first_row] = out_val;
-        localDoneArray[row - first_row] = row_max_depth;
-#endif // USE_LDS_SPINLOOP
-        // doneArray[row] = row_max_depth, written with the GLC bit set.
-        __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP : : "v"(&doneArray[row]), "v"(row_max_depth));
-        asm volatile ("s_waitcnt vmcnt(0)\n\t");
-
-#ifdef USE_ROCSHMEM
-        // Algorithm 2 (blocked puts): the thread that completes the last row
-        // of a CHUNK-sized block pushes the whole block to every other PE.
-        if (rocshmem_algorithm == 2 && total_pes > 1) {
-            int CHUNK = rocshmem_put_block_size;
-            bool sendTime = true;
-            int row_base = (row / CHUNK) * CHUNK;
-            int num_done = atomicAdd(&shadowDoneArray[row_base], 1);
-            sendTime = (num_done == (CHUNK - 1));
-            for (int p = 0; p < total_pes; p++) {
-                if (p != this_pe && sendTime) {
-                    rocshmem_ctx_putmem_nbi(ctx, &out_y[row_base], &out_y[row_base], sizeof(FPTYPE) * CHUNK, p);
-                    rocshmem_ctx_fence(ctx);
-                    rocshmem_ctx_putmem_nbi(ctx, &doneArray[row_base], &doneArray[row_base], sizeof(int) * CHUNK, p);
-                    rocshmem_ctx_quiet(ctx);
-                }
-            }
-        }
-
-        // Algorithm 0 (naive puts): push this row to every other PE, value
-        // first and flag second, separated by a fence.
-        if (rocshmem_algorithm == 0) {
-            for (int p = 0; p < total_pes; p++) {
-                if (p != this_pe) {
-                    rocshmem_ctx_putmem_nbi(ctx, &out_y[row], &out_y[row], sizeof(FPTYPE), p);
-                    rocshmem_ctx_fence(ctx);
-                    rocshmem_ctx_putmem_nbi(ctx, &doneArray[row], &doneArray[row], sizeof(int), p);
-                }
-            }
-        }
-
-        if (rocshmem_algorithm == 3) {
-            // Only broadcast update if another node explicitly registered for this row.  TODO:
-            // Make 2D array to scale
-            unsigned int need_broadcast;
-            __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\ns_waitcnt vmcnt(0)" : "=v"(need_broadcast) : "v"(&reqUpdateArray[row]));
-
-            if (need_broadcast == 1) {
-                for (int p = 0; p < total_pes; p++) {
-                    if (p != this_pe) {
-                        rocshmem_ctx_putmem_nbi(ctx, &out_y[row], &out_y[row], sizeof(FPTYPE), p);
-                        rocshmem_ctx_fence(ctx);
-                        rocshmem_ctx_putmem_nbi(ctx, &doneArray[row], &doneArray[row], sizeof(int), p);
-                    }
-                }
-            }
-        }
-#endif
-
-        // Must atomic these next two, since other WGs are doing the same thing
-        // We're sending out "row_max_depth-1" because of 0-based indexing.
-        // However, we needed to put a non-zero value into the doneArray up above
-        // when we crammed row_max_depth in, so these two will be off by one.
-        atomicAdd(&numRowsAtLevel[row_max_depth-1], 1);
-        atomicMax(maxDepth, row_max_depth);
-        atomicAdd(totalSpin, spin_times);
-    }
-    }
-
-#ifdef USE_ROCSHMEM
-    // Workgroup-collective teardown; reached by every thread because it sits
-    // outside the bounds check above.
-    __syncthreads();
-    rocshmem_wg_ctx_destroy(ctx);
-    rocshmem_wg_finalize();
-#endif
-}
-
-// Solves for 'y' in the equation 'A * y = alpha * x'
-// In this kernel, we do not know what level each row is in. As such, we must
-// dynamically figure this out. Each row has the potential to require data from
-// a previous row. This happens when it has a non-zero in a column.
-// i.e. having a non-zero value in column $foo means you must wait for row $foo
-// to finish.
-//
-// The 'doneArray' has one entry per row. It starts out with each entry containing
-// zeroes. When a row finishes and its output written, it knows its own level
-// (which must be 1 more than the highest level of any row it relied on). As such,
-// it puts that level into the doneArray. If you must wait on a previous row, you
-// spinloop on that row's doneArray entry. Once it's non-zero, you know both that
-// the data is ready, as well as what level that value came from (so you can
-// calculate your own level).
-//
-// The doneArray can be used for future iterations, since the parallelism doesn't
-// change between iterations. As such, we keep the doneArray around and call
-// a different kernel that doesn't do the spin-loop waiting. To prep for that
-// kernel, we also need to know how many rows are at each level. Thus, when a
-// row finishes, it increments the numRowsAtLevel[] entry associated with its
-// level. Also we set the maxDepth variable to the maximum of any level seen.
-//__attribute__((reqd_work_group_size(WF_SIZE*WF_PER_WG, 1, 1)))
-//__kernel void
-// Synchronization-free sparse triangular solve for 'A * y = alpha * x'.
-//
-// Each wavefront handles one row (row = blockIdx * WF_PER_WG + wavefront id).
-// For every off-diagonal entry it spins on doneArray[col] until the producing
-// row has published its result, then subtracts val * out_y[col] from its
-// partial sum. After the cross-lane reduction, OUTPUT_THREAD divides by the
-// diagonal, stores out_y[row] and finally raises doneArray[row].
-//
-// Parameters:
-//   global_work_size  - threads whose global index is >= this value exit
-//                       immediately
-//   vals/cols/rowPtrs - matrix in CSR form (values, column indices, offsets)
-//   vec_x             - right-hand-side vector (scaled by alpha)
-//   out_y             - solution vector; written here, read by dependent rows
-//   alpha             - scalar applied to vec_x
-//   doneArray         - per-row completion flag; this kernel stores the
-//                       constant 1 into a row's entry once out_y[row] is
-//                       written
-//   numRowsAtLevel, maxDepth, totalSpin
-//                     - not referenced by this kernel body
-__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1)
-amd_spts_syncfree_solve(
-                            size_t global_work_size,
-                            const FPTYPE * __restrict__ vals,
-                            const int * __restrict__ cols,
-                            const int * __restrict__ rowPtrs,
-                            const FPTYPE * __restrict__ vec_x,
-                            FPTYPE * __restrict__ out_y,
-                            const FPTYPE alpha,
-                            unsigned int * __restrict__ doneArray,
-                            unsigned int * __restrict__ numRowsAtLevel,
-                            unsigned int * __restrict__ maxDepth,
-                            unsigned long long * __restrict__ totalSpin)
-{
-    if (global_work_size <= hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) return;
-    // Per-thread pointer to the reduction scratch space. It is intentionally
-    // not __shared__: a shared pointer written by every thread without a
-    // barrier lets a late wavefront's nullptr store overwrite the value that
-    // an earlier wavefront is about to read.
-    FPTYPE *lds_ptr = nullptr;
-#ifdef LDS_REDUCTION
-    __shared__ FPTYPE lds[WF_SIZE*WF_PER_WG];
-    lds_ptr = lds;
-#endif
-
-    const unsigned int wg_lid = hipThreadIdx_x;
-    const unsigned int lid = wg_lid % WF_SIZE;
-
-    // Which wavefront within this workgroup
-    // also means which row within this workgroup's group of rows
-    const unsigned int local_offset = wg_lid / WF_SIZE;
-    // First row within this workgroup (within this group of rows)
-    const unsigned int first_row = hipBlockIdx_x * WF_PER_WG;
-    // Actual row this wavefront will work on.
-    const unsigned int row = first_row + local_offset;
-
-    // One diagonal slot per wavefront; written by whichever lane finds the
-    // diagonal entry and read by OUTPUT_THREAD after the reduction.
-    __shared__ FPTYPE diagonal[WF_PER_WG];
-
-#ifdef USE_LDS_SPINLOOP
-    // If we are trying to access an output that was produced by a wavefront
-    // earlier in this workgroup, perform the transfer and spin-loop in LDS
-    // to relieve global memory pressure.
-    __shared__ unsigned int localDoneArray[WF_PER_WG];
-    __shared__ FPTYPE localOutY[WF_PER_WG];
-#endif
-
-    FPTYPE temp_sum = 0.;
-    // Preload the first thread with alpha * x. We can bring this forward
-    // because the 'x' vector in A*y=alpha*x is fixed and known already.
-    // From this point on, we will subtract out values from rows of X from
-    // alpha*x, and that will allow us to solve for entries of y.
-    // Hauling this up to the top of the kernel increases performance because
-    // it removes the memory load and multiply from the critical path of
-    // "previous rows' inputs are ready, finish this and allow further rows
-    // to start up as fast as possible."
-    if (lid == OUTPUT_THREAD)
-        temp_sum = alpha * vec_x[row];
-
-    unsigned int start_of_this_row = rowPtrs[row];
-    unsigned int end_of_this_row = rowPtrs[row+1];
-    unsigned int start_point = start_of_this_row+lid;
-    // This wavefront operates on a single row, from its beginning to end.
-
-    for(unsigned int j = start_point; j < end_of_this_row; j+=WF_SIZE)
-    {
-#ifdef USE_LDS_SPINLOOP
-        if (lid == 0)
-        {
-            localDoneArray[local_offset] = 0;
-            localOutY[local_offset] = 0.;
-        }
-#endif
-
-        // local_col will tell us, for this iteration of the above for loop
-        // (i.e. for this entry in this row), which columns contain the
-        // non-zero values. We must then ensure that the output from the row
-        // associated with the local_col is complete to ensure that we can
-        // calculate the right answer.
-        int local_col = -1;
-        // Haul loading from vals[] up near the load of cols[] so that we get
-        // good coalesced loads.
-        FPTYPE local_val = 0.;
-        unsigned int local_done = 0;
-
-        // Replace the two loads below with inline assembly that sets the
-        // SLC bit. This forces the loads to essentially bypass the L2
-        // to increase cache hit rate on other instructions. Vals and cols
-        // are basically streamed in, so caching them doesn't help much.
-        // local_col = cols[j];
-        // local_val = vals[j];
-#ifdef USE_DOUBLE
-        __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dwordx2 %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j]));
-#else
-        __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dword %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j]));
-#endif
-
-        // diagonal. Skip this, we need to solve for it.
-        if (local_col == row)
-        {
-            local_done = 1;
-            diagonal[local_offset] = local_val;
-        }
-
-        // While there are threads in this workgroup that have been unable to
-        // get their input, loop and wait for the flag to exist.
-        // Priority is lowered while spinning and raised again as soon as the
-        // dependency is satisfied.
-        __asm__ volatile ("s_setprio 0");
-        while (!local_done)
-        {
-#ifdef USE_LDS_SPINLOOP
-            if (local_col >= first_row)
-            {
-                // Check in the LDS if the value was produced by someone
-                // within this workgroup.
-                local_done = localDoneArray[local_col - first_row];
-                asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-            }
-            else
-#endif // USE_LDS_SPINLOOP
-            {
-                // Replace this atomic with an assembly load with GLC bit set.
-                // This forces the load to go to the coherence point, allowing
-                // us to avoid deadlocks.
-                // local_done = atomic_get_done(doneArray, local_col);
-                __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\ns_waitcnt vmcnt(0)" : "=v"(local_done) : "v"(&doneArray[local_col]));
-            }
-            if (local_done)
-            {
-                FPTYPE out_val;
-                __asm__ volatile ("s_setprio 1");
-#ifdef USE_LDS_SPINLOOP
-                if (local_col >= first_row)
-                {
-                    out_val = localOutY[local_col - first_row];
-                    asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-                }
-                else
-#endif // USE_LDS_SPINLOOP
-                {
-                    // The command below is manually replaced with GCN assembly with
-                    // the GLC bit set. This bypasses the L1, allowing us to do a
-                    // coherent load of the variable without needing atomics.
-#ifdef USE_DOUBLE
-                    // out_val = as_double(atom_or((__global ulong *)&(out_y[local_col]), 0));
-                    __asm__ volatile (MEM_PREFIX"_load_dwordx2 %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : "=v"(out_val) : "v"(&out_y[local_col]));
-#else
-                    // out_val = as_float(atomic_or((__global uint *)&(out_y[local_col]), 0));
-                    __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : "=v"(out_val) : "v"(&out_y[local_col]));
-#endif
-                }
-                temp_sum -= local_val * out_val;
-            }
-        }
-    }
-    __asm__ volatile ("s_setprio 1");
-
-    // Take all of the temp_sum values and add them together into
-    // OUTPUT_THREAD's temp_sum value.
-    temp_sum = cross_lane_reduction(temp_sum, lds_ptr, start_of_this_row,
-            end_of_this_row, wg_lid);
-
-    // y = (x-sum_of_vals_from_A) / diag
-    if (lid == OUTPUT_THREAD)
-    {
-#ifndef LDS_REDUCTION
-        // Wait for local memory to quiesce for the diagonal
-        // LDS_REDUCTION has such waits in it already.
-        asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-#endif
-        FPTYPE out_val = temp_sum / diagonal[local_offset];
-        // out_y[row] = out_val, issued as a GLC store followed by a wait so
-        // the value is written before the done flag below is raised.
-#ifdef USE_DOUBLE
-        __asm__ volatile (MEM_PREFIX"_store_dwordx2 %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val));
-#else
-        __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val));
-#endif
-        int set_one = 1;
-#ifdef USE_LDS_SPINLOOP
-        localDoneArray[row - first_row] = 1;
-        localOutY[row - first_row] = out_val;
-#endif // USE_LDS_SPINLOOP
-        // doneArray[row] = 1, published only after out_y[row] above.
-        __asm__ volatile (MEM_PREFIX"_store_byte %0 %1 " OFF_MODIFIER " glc\n" WAKEUP : : "v"(&doneArray[row]), "v"(set_one));
-        // If you add this back in after doing a native_divide up above,
-        // we can get *some* of the accuracy of a full Newton-Raphson
-        // divide while maintaining the performance of the
-        // native_divide() on the critical path.
-        //out_y[row] = temp_sum / diagonal[local_offset];
-    }
-}
-
-// Solves for 'y' in the equation 'A * y = alpha * x'
-// In this kernel, every row is in the same level. As such, we can freely
-// have every workgroup complete at its own pace.
-// However, we must call this kernel multiple times, once per level.
-//
-// The rowMap tells us that, in this level, gid X works on row Y.
-// We need this because each level of the solve can have different numbers
-// of non-contiguous row. This version of our solver uses one kernel call
-// per level.
-//
-// In addition, the 'total_rows_in_prev_levels' tells us how far in that array
-// to look.
-// Level-set solve: one launch per level, one wavefront per row.
-//
-// Parameters:
-//   global_work_size  - threads whose global index is >= this value exit
-//                       immediately
-//   vals/cols/rowPtrs - matrix in CSR form (values, column indices, offsets)
-//   vec_x             - right-hand-side vector (scaled by alpha)
-//   out_y             - solution vector; entries for earlier levels are read
-//                       with plain loads, this row's entry is written
-//   rowMap            - maps (level offset + wavefront index) to a row number
-//   total_rows_in_prev_levels
-//                     - offset into rowMap where this level starts
-//   alpha             - scalar applied to vec_x
-__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1)
-amd_spts_levelset_solve(
-                      size_t global_work_size,
-                      const FPTYPE * __restrict__  vals,
-                      const int * __restrict__  cols,
-                      const int * __restrict__  rowPtrs,
-                      const FPTYPE * __restrict__  vec_x,
-                      FPTYPE * __restrict__  out_y,
-                      const unsigned int * __restrict__  rowMap,
-                      const unsigned int total_rows_in_prev_levels,
-                      const FPTYPE alpha)
-{
-    if (global_work_size <= hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) return;
-    // Per-thread pointer to the reduction scratch space. It is intentionally
-    // not __shared__: a shared pointer written by every thread without a
-    // barrier lets a late wavefront's nullptr store overwrite the value that
-    // an earlier wavefront is about to read.
-    FPTYPE *lds_ptr = nullptr;
-#ifdef LDS_REDUCTION
-    __shared__ FPTYPE lds[WF_SIZE*WF_PER_WG];
-    lds_ptr = lds;
-#endif
-
-    // Which wavefront within this workgroup
-    // also means which row within this workgroup's group of rows
-    const unsigned int local_offset = hipThreadIdx_x / WF_SIZE;
-    // First row within this workgroup (within this group of rows)
-    const unsigned int first_row = hipBlockIdx_x * WF_PER_WG;
-
-    const unsigned int wg_lid = hipThreadIdx_x;
-    const unsigned int lid = wg_lid % WF_SIZE;
-
-    // Translate this wavefront's slot in the current level into a real row.
-    const unsigned int row = rowMap[total_rows_in_prev_levels+first_row+local_offset];
-
-    // One diagonal slot per wavefront; written by whichever lane finds the
-    // diagonal entry and read by OUTPUT_THREAD after the reduction.
-    __shared__ FPTYPE diagonal[WF_PER_WG];
-    FPTYPE temp_sum = 0.;
-
-    // Preload the first thread with alpha * x. We can bring this forward
-    // because the 'x' vector in A*y=alpha*x is fixed and known already.
-    // From this point on, we will subtract out values from rows of X from
-    // alpha*x, and that will allow us to solve for entries of y.
-    if (lid == OUTPUT_THREAD)
-        temp_sum = alpha * vec_x[row];
-
-    unsigned int start_of_this_row = rowPtrs[row];
-    unsigned int end_of_this_row = rowPtrs[row+1];
-    unsigned int start_point = start_of_this_row+lid;
-
-    // This wavefront operates on a single row, from its beginning to end.
-    for(unsigned int j = start_point; j < end_of_this_row; j+=WF_SIZE)
-    {
-        // local_col will tell us, for this iteration of the above for loop
-        // (i.e. for this entry in this row), which columns contain the
-        // non-zero values. We must then ensure that the output from the row
-        // associated with the local_col is complete to ensure that we can
-        // calculate the right answer.
-        int local_col = -1;
-        // Haul loading from vals[] up near the load of cols[] so that we get
-        // good coalesced loads.
-        FPTYPE local_val = 0.;
-
-        // Replace the two loads below with inline assembly that sets the
-        // SLC bit. This forces the loads to essentially bypass the L2
-        // to increase cache hit rate on other instructions. Vals and cols
-        // are basically streamed in, so caching them doesn't help much.
-        // local_col = cols[j];
-        // local_val = vals[j];
-#ifdef USE_DOUBLE
-        __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dwordx2 %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j]));
-#else
-        __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dword %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j]));
-#endif
-
-        // diagonal. Skip this, we need to solve for it.
-        if (local_col == row)
-            diagonal[local_offset] = local_val;
-        else
-        {
-            // No spin-wait here: the value is read with a plain load.
-            FPTYPE out_val = out_y[local_col];
-            temp_sum -= local_val * out_val;
-        }
-    }
-    // Take all of the temp_sum values and add them together into
-    // OUTPUT_THREAD's temp_sum value.
-    temp_sum = cross_lane_reduction(temp_sum, lds_ptr,
-            start_of_this_row, end_of_this_row, wg_lid);
-
-    // y = (x-sum_of_vals_from_A) / diag
-    if (lid == OUTPUT_THREAD)
-    {
-#ifndef LDS_REDUCTION
-        // Wait for local memory to quiesce for the diagonal
-        // LDS_REDUCTION has such waits in it already.
-        asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-#endif
-        out_y[row] = temp_sum / diagonal[local_offset]; // original divide
-    }
-}
-
-// Solves for 'y' in the equation 'A * y = alpha * x'
-// This kernel will only work if we launch a single workgroup that will
-// solve multiple levels in a serial fashion. For each level, every thread
-// within that level will try to solve for a different row.
-// After solving for this level, the single workgroup hits a workgroup-wide
-// barrier instruction waiting for all the other rows in this level to
-// complete.
-//
-// We can only solve up to 1024 rows in a single level call right now,
-// because each thread will solve a single row per level.
-//
-// This is a "CSR-Scalar" style analysis, where each thread is accessing
-// a potentially very different area of both the CSR matrix and the vector.
-// Performance may be bad, but this is very easy to write.
-//
-// The rowMap tells us that, within a level, thread X works on row Y.
-// We need this because each level of the solve can have different numbers
-// of non-contiguous row.
-// In addition, the 'total_rows_in_prev_levels' tells us how far in that array
-// to look.
-//
-// [start_level, end_level) tell us which entries in the rowMap we will go
-// through in this kernel invocation.
-// Single-workgroup, one-thread-per-row ("CSR-Scalar") solve over the levels
-// [start_level, end_level).
-//
-// Parameters:
-//   global_work_size  - threads whose global index is >= this value exit
-//                       immediately
-//   vals/cols/rowPtrs - matrix in CSR form (values, column indices, offsets)
-//   vec_x             - right-hand-side vector (scaled by alpha)
-//   out_y             - solution vector; read for dependencies, written per row
-//   alpha             - scalar applied to vec_x
-//   rowMap            - maps (level offset + thread index) to a row number
-//   totalRowsInEachLevel
-//                     - number of rows in each level
-//   total_rows_in_prev_levels
-//                     - offset into rowMap where start_level begins
-//   start_level, end_level
-//                     - half-open range of levels handled by this launch
-__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1)
-amd_spts_scalar_solve(
-                      size_t global_work_size,
-                      const FPTYPE * __restrict__  vals,
-                      const int * __restrict__  cols,
-                      const int * __restrict__  rowPtrs,
-                      const FPTYPE * __restrict__  vec_x,
-                      FPTYPE * __restrict__  out_y,
-                      const FPTYPE alpha,
-                      const unsigned int * __restrict__  rowMap,
-                      const unsigned int * __restrict__  totalRowsInEachLevel,
-                      const unsigned int total_rows_in_prev_levels,
-                      const unsigned int start_level,
-                      const unsigned int end_level)
-{
-    if (global_work_size <= hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) return;
-    const unsigned int wg_lid = hipThreadIdx_x;
-
-    // Rows consumed by the levels already processed in this launch. Each
-    // thread keeps its own copy: a single __shared__ counter bumped by thread
-    // 0 at the end of a level could be updated while slower wavefronts were
-    // still reading it for that same level, sending them to the wrong row.
-    unsigned int total_rows_seen_so_far = 0;
-
-    // We have a single workgroup, and it is going to walk through a
-    // contiguous set of "levels" in the dependency graph.
-    for (unsigned int current_level = start_level; current_level < end_level; current_level++)
-    {
-        // Every time we reach a new level, all of the threads within
-        // this workgroup need to have completed their row's work.
-        // This guarantees that we have synchronized.
-        __syncthreads();
-        const unsigned int rows_in_level = totalRowsInEachLevel[current_level];
-        if (wg_lid < rows_in_level)
-        {
-            const unsigned int entry_in_row_map = total_rows_in_prev_levels + total_rows_seen_so_far + wg_lid;
-            const unsigned int row = rowMap[entry_in_row_map];
-            FPTYPE diagonal = 0.;
-            FPTYPE temp_sum = alpha * vec_x[row];
-
-            unsigned int start_of_this_row = rowPtrs[row];
-            unsigned int end_of_this_row = rowPtrs[row+1];
-
-            // This thread operates on a single row, from its beginning to end.
-            for(unsigned int j = start_of_this_row; j < end_of_this_row; j++)
-            {
-                // local_col will tell us, for this iteration of the above for loop
-                // (i.e. for this entry in this row), which columns contain the
-                // non-zero values. We must then ensure that the output from the row
-                // associated with the local_col is complete to ensure that we can
-                // calculate the right answer.
-                int local_col = cols[j];
-                // Haul loading from vals[] up near the load of cols[] so that we get
-                // good coalesced loads.
-                FPTYPE local_val = vals[j];
-
-                // diagonal. Skip this, we need to solve for it.
-                if (local_col == row)
-                    diagonal = local_val;
-                else
-                {
-                    // Read out_y through an atomic OR with 0 so the load is
-                    // coherent; the returned bits are the stored value.
-                    FPTYPE out_val;
-#ifdef USE_DOUBLE
-                    // Reinterpret the 64 returned bits as a double. A numeric
-                    // integer-to-double conversion (__ull2double_rd) would
-                    // produce a completely different value.
-                    out_val = __longlong_as_double((long long)atomicOr((unsigned long long *)&(out_y[local_col]), 0));
-#else
-                    out_val = as_float(atomicOr((uint *)&(out_y[local_col]), 0));
-#endif
-                    temp_sum -= local_val * out_val;
-                }
-            }
-
-            // y = (x-sum_of_vals_from_A) / diag
-            FPTYPE out_val = temp_sum / diagonal;
-            out_y[row] = out_val;
-        }
-        // Advance past this level's rows for the next iteration.
-        total_rows_seen_so_far += rows_in_level;
-    }
-}
-
-// Solves for 'y' in the equation 'A * y = alpha * x'
-// This kernel will only work if we launch a single workgroup that will
-// solve multiple levels in a serial fashion. For each level, every wavefront
-// within that level will try to solve for a different row.
-// After solving for this level, the single workgroup hits a workgroup-wide
-// barrier instruction waiting for all the other rows in this level to
-// complete.
-//
-// Within a level, this algorithm will loop through the rows, so we should
-// be able to handle levels of any size -- no synchronization is needed
-// between the wavefronts working on a single level, since those rows are
-// independent of one another.
-//
-// This is a "CSR-Vector" style execution, where each wavefront accesses
-// coalesced values within its row, but where short rows waste thread
-// resources.
-//
-// The rowMap tells us that, within a level, thread X works on row Y.
-// We need this because each level of the solve can have different numbers
-// of non-contiguous row.
-// In addition, the 'total_rows_in_prev_levels' tells us how far in that array
-// to look.
-//
-// [start_level, end_level) tell us which entries in the rowMap we will go
-// through in this kernel invocation.
-// Single-workgroup, one-wavefront-per-row ("CSR-Vector") solve over the
-// levels [start_level, end_level). Within a level each wavefront strides over
-// the level's rows in steps of WF_PER_WG.
-//
-// Parameters:
-//   global_work_size  - threads whose global index is >= this value exit
-//                       immediately
-//   vals/cols/rowPtrs - matrix in CSR form (values, column indices, offsets)
-//   vec_x             - right-hand-side vector (scaled by alpha)
-//   out_y             - solution vector; read for dependencies, written per row
-//   alpha             - scalar applied to vec_x
-//   rowMap            - maps (level offset + row slot) to a row number
-//   totalRowsInEachLevel
-//                     - number of rows in each level
-//   total_rows_in_prev_levels
-//                     - offset into rowMap where start_level begins
-//   start_level, end_level
-//                     - half-open range of levels handled by this launch
-__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1)
-amd_spts_vector_solve(
-                      size_t global_work_size,
-                      const FPTYPE * __restrict__  vals,
-                      const int * __restrict__  cols,
-                      const int * __restrict__  rowPtrs,
-                      const FPTYPE * __restrict__  vec_x,
-                      FPTYPE *  out_y,
-                      const FPTYPE alpha,
-                      const unsigned int * __restrict__  rowMap,
-                      const unsigned int * __restrict__  totalRowsInEachLevel,
-                      const unsigned int total_rows_in_prev_levels,
-                      const unsigned int start_level,
-                      const unsigned int end_level)
-{
-    if (global_work_size <= hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) return;
-    // Per-thread pointer to the reduction scratch space. It is intentionally
-    // not __shared__: a shared pointer written by every thread without a
-    // barrier lets a late wavefront's nullptr store overwrite the value that
-    // an earlier wavefront is about to read.
-    FPTYPE *lds_ptr = nullptr;
-#ifdef LDS_REDUCTION
-    __shared__ FPTYPE lds[WF_SIZE*WF_PER_WG];
-    lds_ptr = lds;
-#endif
-    // One diagonal slot per wavefront; written by whichever lane finds the
-    // diagonal entry and read by OUTPUT_THREAD after the reduction.
-    __shared__ FPTYPE diagonal[WF_PER_WG];
-
-    const unsigned int wg_lid = hipThreadIdx_x;
-    const unsigned int lid = wg_lid % WF_SIZE;
-    const unsigned int wf_id = wg_lid / WF_SIZE;
-
-    // Rows consumed by the levels already processed in this launch.
-    unsigned int total_rows_seen_so_far = 0;
-
-    // We have a single workgroup, and it is going to walk through a
-    // contiguous set of "levels" in the dependency graph.
-    for (unsigned int current_level = start_level; current_level < end_level; current_level++)
-    {
-        // Every time we reach a new level, all of the wavefronts within
-        // this workgroup need to have completed their row's work.
-        // This guarantees that we have synchronized.
-        __syncthreads();
-        for (unsigned int cur_loc_row = wf_id; cur_loc_row < totalRowsInEachLevel[current_level]; cur_loc_row += WF_PER_WG)
-        {
-            const unsigned int entry_in_row_map = total_rows_in_prev_levels + total_rows_seen_so_far + cur_loc_row;
-            const unsigned int row = rowMap[entry_in_row_map];
-            FPTYPE temp_sum = 0.;
-
-            // Only OUTPUT_THREAD starts from alpha * x; the other lanes
-            // contribute just their partial products.
-            if (lid == OUTPUT_THREAD)
-                temp_sum = alpha * vec_x[row];
-
-            unsigned int start_of_this_row = rowPtrs[row];
-            unsigned int end_of_this_row = rowPtrs[row+1];
-
-            // This wavefront operates on a single row, from its beginning to end.
-            for(unsigned int j = start_of_this_row + lid; j < end_of_this_row; j += WF_SIZE)
-            {
-                // local_col will tell us, for this iteration of the above for loop
-                // (i.e. for this entry in this row), which columns contain the
-                // non-zero values. We must then ensure that the output from the row
-                // associated with the local_col is complete to ensure that we can
-                // calculate the right answer.
-                int local_col = -1;
-                // Haul loading from vals[] up near the load of cols[] so that we get
-                // good coalesced loads.
-                FPTYPE local_val = 0.;
-
-                // Replace the two loads below with inline assembly that sets the
-                // SLC bit. This forces the loads to essentially bypass the L2
-                // to increase cache hit rate on other instructions. Vals and cols
-                // are basically streamed in, so caching them doesn't help much.
-                //local_col = cols[j];
-                //local_val = vals[j];
-#ifdef USE_DOUBLE
-                __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dwordx2 %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j]));
-#else
-                __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dword %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j]));
-#endif
-
-                // diagonal. Skip this, we need to solve for it.
-                if (local_col == row)
-                    diagonal[wf_id] = local_val;
-                else
-                {
-                    FPTYPE out_val;
-                    out_val = out_y[local_col];
-                    temp_sum -= local_val * out_val;
-                }
-            }
-
-            // Take all of the temp_sum values and add them together into
-            // OUTPUT_THREAD's temp_sum value.
-            temp_sum = cross_lane_reduction(temp_sum, lds_ptr,
-                    start_of_this_row, end_of_this_row, wg_lid);
-
-            // y = (x-sum_of_vals_from_A) / diag
-            if (lid == OUTPUT_THREAD)
-            {
-#ifndef LDS_REDUCTION
-                // Wait for local memory to quiesce for the diagonal
-                // LDS_REDUCTION has such waits in it already.
-                asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-#endif
-                FPTYPE out_val = temp_sum / diagonal[wf_id];
-                out_y[row] = out_val;
-            }
-        }
-        // Advance past this level's rows for the next iteration.
-        total_rows_seen_so_far += totalRowsInEachLevel[current_level];
-    }
-}
-
-// Solves for 'y' in the equation 'A * y = alpha * x'
-// This kernel is a simplified modification of the synchronization-free kernel.
-// However, it is set up to work on rows that are in a contiguous series of
-// levels. As such, this must be run after the initial analysis phase has
-// produced a row map.
-//
-// Within a level, this kernel can use multiple workgroups to work on many
-// rows simultaneously. In addition, multiple levels can be in flight at once,
-// and this algorithm will use the synchronization-free spin-looping to produce
-// the correct answer.
-//
-// However, we may not want to use *just* the synchronization-free spin-looping
-// approach on all rows at the same time, as many rows deep in the dependency
-// graph may just end up waiting, and spinning, for a long time. This spinning
-// can slow down everyone else. As such, we partially break the dependency graph
-// into multiple kernel invocations. This slightly reduces the theoretical
-// parallelism, but it can make some invocations much faster due to less noise.
-//
-// The rowMap tells us that, within a level, thread X works on row Y.
-// We need this because each level of the solve can have different numbers
-// of non-contiguous row.
-// In addition, the 'total_rows_in_prev_levels' tells us how far in that array
-// to look, since previous kernel launches completed some previous rows.
-// Spin-loop solve over rows selected through rowMap: one wavefront per row.
-//
-// Each wavefront looks its row up in rowMap, then for every off-diagonal
-// entry spins on doneArray[col] before consuming out_y[col]. After the
-// cross-lane reduction, OUTPUT_THREAD divides by the diagonal, stores
-// out_y[row] and then raises doneArray[row].
-//
-// Parameters:
-//   global_work_size  - threads whose global index is >= this value exit
-//                       immediately
-//   vals/cols/rowPtrs - matrix in CSR form (values, column indices, offsets)
-//   vec_x             - right-hand-side vector (scaled by alpha)
-//   out_y             - solution vector; written here, read by dependent rows
-//   alpha             - scalar applied to vec_x
-//   doneArray         - per-row completion flag; this kernel stores the
-//                       constant 1 once out_y[row] is written
-//   rowMap            - maps (offset + wavefront index) to a row number
-//   total_rows_in_prev_levels
-//                     - offset into rowMap where this launch's rows begin
-__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1)
-amd_spts_levelsync_solve(
-                      size_t global_work_size,
-                      const FPTYPE * __restrict__  vals,
-                      const int * __restrict__  cols,
-                      const int * __restrict__  rowPtrs,
-                      const FPTYPE * __restrict__  vec_x,
-                      FPTYPE * __restrict__  out_y,
-                      const FPTYPE alpha,
-                      unsigned int * __restrict__  doneArray,
-                      const unsigned int * __restrict__  rowMap,
-                      const unsigned int total_rows_in_prev_levels)
-{
-    if (global_work_size <= hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) return;
-    // Per-thread pointer to the reduction scratch space. It is intentionally
-    // not __shared__: a shared pointer written by every thread without a
-    // barrier lets a late wavefront's nullptr store overwrite the value that
-    // an earlier wavefront is about to read.
-    FPTYPE *lds_ptr = nullptr;
-#ifdef LDS_REDUCTION
-    __shared__ FPTYPE lds[WF_SIZE*WF_PER_WG];
-    lds_ptr = lds;
-#endif
-    // One diagonal slot per wavefront; written by whichever lane finds the
-    // diagonal entry and read by OUTPUT_THREAD after the reduction.
-    __shared__ FPTYPE diagonal[WF_PER_WG];
-
-    const unsigned int gid = hipBlockIdx_x;
-    const unsigned int wg_lid = hipThreadIdx_x;
-    const unsigned int lid = wg_lid % WF_SIZE;
-    const unsigned int wf_id = wg_lid / WF_SIZE;
-
-    // Translate this wavefront's slot into a real row number.
-    const unsigned int row = rowMap[total_rows_in_prev_levels + (gid * WF_PER_WG) + wf_id];
-
-    FPTYPE temp_sum = 0.;
-
-    // Only OUTPUT_THREAD starts from alpha * x; the other lanes contribute
-    // just their partial products.
-    if (lid == OUTPUT_THREAD)
-        temp_sum = alpha * vec_x[row];
-    unsigned int start_of_this_row = rowPtrs[row];
-    unsigned int end_of_this_row = rowPtrs[row+1];
-    unsigned int start_point = start_of_this_row+lid;
-
-    // This wavefront operates on a single row, from its beginning to end.
-    for(unsigned int j = start_point; j < end_of_this_row; j+=WF_SIZE)
-    {
-        // local_col will tell us, for this iteration of the above for loop
-        // (i.e. for this entry in this row), which columns contain the
-        // non-zero values. We must then ensure that the output from the row
-        // associated with the local_col is complete to ensure that we can
-        // calculate the right answer.
-        int local_col = -1;
-        // Haul loading from vals[] up near the load of cols[] so that we get
-        // good coalesced loads.
-        FPTYPE local_val = 0.;
-        unsigned int local_done = 0;
-
-        // Replace the two loads below with inline assembly that sets the
-        // SLC bit. This forces the loads to essentially bypass the L2
-        // to increase cache hit rate on other instructions. Vals and cols
-        // are basically streamed in, so caching them doesn't help much.
-        // local_col = cols[j];
-        // local_val = vals[j];
-#ifdef USE_DOUBLE
-        __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dwordx2 %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j]));
-#else
-        __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dword %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j]));
-#endif
-
-        // diagonal. Skip this, we need to solve for it.
-        if (local_col == row)
-        {
-            local_done = 1;
-            diagonal[wf_id] = local_val;
-        }
-
-        // While there are threads in this workgroup that have been unable to
-        // get their input, loop and wait for the flag to exist.
-        // Priority is lowered while spinning and raised again as soon as the
-        // dependency is satisfied.
-        __asm__ volatile ("s_setprio 0");
-        while (!local_done)
-        {
-            {
-                // Replace this atomic with an assembly load with GLC bit set.
-                // This forces the load to go to the coherence point, allowing
-                // us to avoid deadlocks.
-                // local_done = atomic_get_done(doneArray, local_col);
-                __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\ns_waitcnt vmcnt(0)" : "=v"(local_done) : "v"(&doneArray[local_col]));
-            }
-            if (local_done)
-            {
-                FPTYPE out_val;
-                __asm__ volatile ("s_setprio 1");
-                // The command below is manually replaced with GCN assembly with
-                // the GLC bit set. This bypasses the L1, allowing us to do a
-                // coherent load of the variable without needing atomics.
-#ifdef USE_DOUBLE
-                // out_val = as_double(atom_or((__global ulong *)&(out_y[local_col]), 0));
-                __asm__ volatile (MEM_PREFIX"_load_dwordx2 %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : "=v"(out_val) : "v"(&out_y[local_col]));
-#else
-                // out_val = as_float(atomic_or((__global uint *)&(out_y[local_col]), 0));
-                __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : "=v"(out_val) : "v"(&out_y[local_col]));
-#endif
-                temp_sum -= local_val * out_val;
-            }
-        }
-    }
-    __asm__ volatile ("s_setprio 1");
-    // Take all of the temp_sum values and add them together into
-    // OUTPUT_THREAD's temp_sum value.
-    temp_sum = cross_lane_reduction(temp_sum, lds_ptr, start_of_this_row,
-            end_of_this_row, wg_lid);
-    // y = (x-sum_of_vals_from_A) / diag
-    if (lid == OUTPUT_THREAD)
-    {
-#ifndef LDS_REDUCTION
-        // Wait for local memory to quiesce for the diagonal
-        // LDS_REDUCTION has such waits in it already.
-        asm volatile ("s_waitcnt lgkmcnt(0)\n\t");
-#endif
-        FPTYPE out_val = temp_sum / diagonal[wf_id];
-        // out_y[row] = out_val, issued as a GLC store followed by a wait so
-        // the value is written before the done flag below is raised.
-#ifdef USE_DOUBLE
-        __asm__ volatile (MEM_PREFIX"_store_dwordx2 %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val));
-#else
-        __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val));
-#endif
-        int set_one = 1;
-        // doneArray[row] = 1, published only after out_y[row] above.
-        __asm__ volatile (MEM_PREFIX"_store_byte %0 %1 " OFF_MODIFIER " glc\n" WAKEUP : : "v"(&doneArray[row]), "v"(set_one));
-        // If you add this back in after doing a native_divide up above,
-        // we can get *some* of the accuracy of a full Newton-Raphson
-        // divide while maintaining the performance of the
-        // native_divide() on the critical path.
-        //out_y[row] = temp_sum / diagonal[wf_id];
-    }
-}
diff --git a/projects/rocshmem/internal/continuous_integration/compile/Jenkinsfile b/projects/rocshmem/internal/continuous_integration/compile/Jenkinsfile
deleted file mode 100644
index 426a2004f8..0000000000
--- a/projects/rocshmem/internal/continuous_integration/compile/Jenkinsfile
+++ /dev/null
@@ -1,118 +0,0 @@
-pipeline {
-    agent { label 'sv-pdp-5' }
-    environment {
-        HSA_FORCE_FINE_GRAIN_PCIE = 1
-        MPI_HOME="/home/resperf/mpich-4.0.1/install/global"
-        PATH = "$MPI_HOME/bin:$PATH"
-        LD_LIBRARY_PATH = "$MPI_HOME/lib:$LD_LIBRARY_PATH"
-        build_dir = "builds/change-${GERRIT_CHANGE_NUMBER}-${GERRIT_PATCHSET_NUMBER}"
-        CMAKE_PREFIX_PATH = "/opt/rocm/lib/cmake"
-    }
-    stages {
-        stage('Synchronize Source Code') {
-            steps {
-                checkout changelog: false, poll: false, scm: [$class: 'GitSCM', branches: [[name: 'FETCH_HEAD']], doGenerateSubmoduleConfigurations: false, extensions: [[$class: 'CloneOption', depth: 0, noTags: false, reference: '', shallow: false]], submoduleCfg: [], userRemoteConfigs: [[name: 'origin', refspec: '${GERRIT_REFSPEC}', url: 'ssh://gerritgit/rsch/ec/shmem']]]
-            }
-        }
-        stage('Make Build Directory') {
-            steps {
-                dir("library") {
-                    sh "mkdir -p ${build_dir}"
-                }
-            }
-        }
-        stage('Build Source Code') {
-            parallel {
-                stage('RC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_SINGLE") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI_WF_COAL') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi_wf_coal install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_MULTI") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        }
-                    }
-                }
-
-                stage('DC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_SINGLE") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_MULTI") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
diff --git a/projects/rocshmem/internal/continuous_integration/compile/compile_config.xml b/projects/rocshmem/internal/continuous_integration/compile/compile_config.xml
deleted file mode 100644
index ccaf58b58a..0000000000
--- a/projects/rocshmem/internal/continuous_integration/compile/compile_config.xml
+++ /dev/null
@@ -1,93 +0,0 @@
-<?xml version='1.1' encoding='UTF-8'?>
-<flow-definition plugin="workflow-job@2.40">
-  <actions>
-    <org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobAction plugin="pipeline-model-definition@1.8.4"/>
-    <org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobPropertyTrackerAction plugin="pipeline-model-definition@1.8.4">
-      <jobProperties/>
-      <triggers/>
-      <parameters/>
-      <options/>
-    </org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobPropertyTrackerAction>
-  </actions>
-  <description></description>
-  <keepDependencies>false</keepDependencies>
-  <properties>
-    <org.jenkinsci.plugins.workflow.job.properties.PipelineTriggersJobProperty>
-      <triggers>
-        <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.GerritTrigger plugin="gerrit-trigger@2.33.0">
-          <spec></spec>
-          <gerritProjects>
-            <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.GerritProject>
-              <compareType>PLAIN</compareType>
-              <pattern>rsch/ec/shmem</pattern>
-              <branches>
-                <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.Branch>
-                  <compareType>PLAIN</compareType>
-                  <pattern>amd-master</pattern>
-                </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.Branch>
-              </branches>
-              <disableStrictForbiddenFileVerification>false</disableStrictForbiddenFileVerification>
-            </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.GerritProject>
-          </gerritProjects>
-          <dynamicGerritProjects class="empty-list"/>
-          <skipVote>
-            <onSuccessful>true</onSuccessful>
-            <onFailed>true</onFailed>
-            <onUnstable>true</onUnstable>
-            <onNotBuilt>true</onNotBuilt>
-            <onAborted>true</onAborted>
-          </skipVote>
-          <silentMode>false</silentMode>
-          <enableTopicAssociation>false</enableTopicAssociation>
-          <notificationLevel></notificationLevel>
-          <silentStartMode>false</silentStartMode>
-          <escapeQuotes>true</escapeQuotes>
-          <nameAndEmailParameterMode>PLAIN</nameAndEmailParameterMode>
-          <dependencyJobsNames></dependencyJobsNames>
-          <commitMessageParameterMode>BASE64</commitMessageParameterMode>
-          <changeSubjectParameterMode>PLAIN</changeSubjectParameterMode>
-          <commentTextParameterMode>BASE64</commentTextParameterMode>
-          <buildStartMessage></buildStartMessage>
-          <buildFailureMessage></buildFailureMessage>
-          <buildSuccessfulMessage></buildSuccessfulMessage>
-          <buildUnstableMessage></buildUnstableMessage>
-          <buildNotBuiltMessage></buildNotBuiltMessage>
-          <buildAbortedMessage></buildAbortedMessage>
-          <buildUnsuccessfulFilepath></buildUnsuccessfulFilepath>
-          <customUrl></customUrl>
-          <serverName>amd-gerrit</serverName>
-          <triggerOnEvents>
-            <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.events.PluginCommentAddedContainsEvent>
-              <commentAddedCommentContains>!COMPILE</commentAddedCommentContains>
-            </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.events.PluginCommentAddedContainsEvent>
-          </triggerOnEvents>
-          <dynamicTriggerConfiguration>false</dynamicTriggerConfiguration>
-          <triggerConfigURL></triggerConfigURL>
-          <triggerInformationAction/>
-        </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.GerritTrigger>
-      </triggers>
-    </org.jenkinsci.plugins.workflow.job.properties.PipelineTriggersJobProperty>
-  </properties>
-  <definition class="org.jenkinsci.plugins.workflow.cps.CpsScmFlowDefinition" plugin="workflow-cps@2.90">
-    <scm class="hudson.plugins.git.GitSCM" plugin="git@4.7.1">
-      <configVersion>2</configVersion>
-      <userRemoteConfigs>
-        <hudson.plugins.git.UserRemoteConfig>
-          <url>ssh://gerritgit/rsch/ec/shmem</url>
-        </hudson.plugins.git.UserRemoteConfig>
-      </userRemoteConfigs>
-      <branches>
-        <hudson.plugins.git.BranchSpec>
-          <name>FETCH_HEAD</name>
-        </hudson.plugins.git.BranchSpec>
-      </branches>
-      <doGenerateSubmoduleConfigurations>false</doGenerateSubmoduleConfigurations>
-      <submoduleCfg class="empty-list"/>
-      <extensions/>
-    </scm>
-    <scriptPath>internal/continuous_integration/compile/Jenkinsfile</scriptPath>
-    <lightweight>false</lightweight>
-  </definition>
-  <triggers/>
-  <disabled>false</disabled>
-</flow-definition>
\ No newline at end of file
diff --git a/projects/rocshmem/internal/continuous_integration/gdb/Jenkinsfile b/projects/rocshmem/internal/continuous_integration/gdb/Jenkinsfile
deleted file mode 100644
index 48106a3a5b..0000000000
--- a/projects/rocshmem/internal/continuous_integration/gdb/Jenkinsfile
+++ /dev/null
@@ -1,221 +0,0 @@
-pipeline {
-    agent { label 'sv-pdp-5' }
-    environment {
-        HSA_FORCE_FINE_GRAIN_PCIE = 1
-        MPI_HOME="/home/resperf/mpich-4.0.1/install/global"
-        PATH = "$MPI_HOME/bin:$PATH"
-        LD_LIBRARY_PATH = "$MPI_HOME/lib:$LD_LIBRARY_PATH"
-        build_dir = "builds/change-${GERRIT_CHANGE_NUMBER}-${GERRIT_PATCHSET_NUMBER}"
-        CMAKE_PREFIX_PATH = "/opt/rocm/lib/cmake"
-    }
-    stages {
-        stage('Synchronize Source Code') {
-            steps {
-                checkout changelog: false, poll: false, scm: [$class: 'GitSCM', branches: [[name: 'FETCH_HEAD']], doGenerateSubmoduleConfigurations: false, extensions: [[$class: 'CloneOption', depth: 0, noTags: false, reference: '', shallow: false]], submoduleCfg: [], userRemoteConfigs: [[name: 'origin', refspec: '${GERRIT_REFSPEC}', url: 'ssh://gerritgit/rsch/ec/shmem']]]
-            }
-        }
-        stage('Env Variables') {
-            steps {
-                sh 'printenv'
-            }
-        }
-        stage('Make Build Directory') {
-            steps {
-                dir("library") {
-                    sh "mkdir -p ${build_dir}"
-                }
-            }
-        }
-        stage('Build Source Code') {
-
-            failFast true
-
-            parallel {
-                stage('RC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        }
-                        //===================== SPTS ==========================
-                        //dir("internal/clients/spts/${build_dir}/RC_SINGLE") {
-                        //    sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        //}
-                    }
-                }
-
-                stage('RC_MULTI_WF_COAL') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi_wf_coal install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        }
-                        //===================== SPTS ==========================
-                        //dir("internal/clients/spts/${build_dir}/RC_MULTI_WF_COAL") {
-                        //    sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        //}
-                    }
-                }
-
-                stage('RC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        }
-                        //===================== SPTS ==========================
-                        //dir("internal/clients/spts/${build_dir}/RC_MULTI") {
-                        //    sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        //}
-                    }
-                }
-
-                stage('DC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        }
-                        //===================== SPTS ==========================
-                        //dir("internal/clients/spts/${build_dir}/DC_SINGLE") {
-                        //    sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        //}
-                    }
-                }
-
-                stage('DC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        }
-                        //===================== SPTS ==========================
-                        //dir("internal/clients/spts/${build_dir}/DC_MULTI") {
-                        //    sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        //}
-                    }
-                }
-            }
-        }
-        stage('Run Tests') {
-            stages {
-                stage('RC_SINGLE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE true'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh './driver.sh ${build_dir}/RC_SINGLE/spts single_thread ${build_dir}/RC_SINGLE'
-                        //}
-                    }
-                }
-
-                stage('RC_MULTI_WF_COAL') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI_WF_COAL true'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/spts multi_thread ${build_dir}/RC_MULTI_WF_COAL'
-                        //}
-                    }
-                }
-
-                stage('RC_MULTI') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI true'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh './driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RC_MULTI'
-                        //}
-                    }
-                }
-
-                stage('DC_SINGLE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/DC_SINGLE true'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh './driver.sh ${build_dir}/DC_SINGLE/spts single_thread ${build_dir}/DC_SINGLE'
-                        //}
-                    }
-                }
-
-                stage('DC_MULTI') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI true'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh './driver.sh ${build_dir}/DC_MULTI/spts multi_thread ${build_dir}/DC_MULTI'
-                        //}
-                    }
-                }
-
-                stage('RO_NET_BASIC') {
-                    // RO_NET controlled at runtime, no need for a new build. Use RC_MULTI
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
-                            sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC true'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
-                        //    sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
-                        //}
-                    }
-                }
-            }
-        }
-        stage('Generate Checker Metadata') {
-            steps {
-                dir("library/${build_dir}") {
-                    sh 'git fetch --tags'
-                    sh 'git log --pretty=oneline remotes/origin/amd-master.. > changeset_delta.txt'
-                    sh 'git log --pretty=oneline remotes/origin/amd-master~1..remotes/origin/amd-master >> changeset_delta.txt'
-                }
-            }
-        }
-        stage('Archive Artifacts') {
-            steps {
-                dir("library/${build_dir}") {
-                    archiveArtifacts artifacts: 'changeset_delta.txt'
-                }
-                dir("clients/functional_tests/${build_dir}") {
-                    archiveArtifacts artifacts: 'RC_SINGLE/**/*.log'
-                    archiveArtifacts artifacts: 'RC_MULTI/**/*.log'
-                    archiveArtifacts artifacts: 'DC_SINGLE/**/*.log'
-                    archiveArtifacts artifacts: 'DC_MULTI/**/*.log'
-                    archiveArtifacts artifacts: 'RO_NET_BASIC/**/*.log'
-                }
-            }
-        }
-    }
-}
diff --git a/projects/rocshmem/internal/continuous_integration/long/Jenkinsfile b/projects/rocshmem/internal/continuous_integration/long/Jenkinsfile
deleted file mode 100644
index 77c3420784..0000000000
--- a/projects/rocshmem/internal/continuous_integration/long/Jenkinsfile
+++ /dev/null
@@ -1,413 +0,0 @@
-pipeline {
-    agent { label 'sv-pdp-5' }
-    environment {
-        build_dir = "builds/change-${GERRIT_CHANGE_NUMBER}-${GERRIT_PATCHSET_NUMBER}"
-
-        MPI_HOME="/home/resperf/mpich-4.0.1/install/global"
-        UCX_HOME="/home/resperf/ucx/install"
-
-        PATH="$MPI_HOME/bin:$UCX_HOME/bin:$PATH"
-        LD_LIBRARY_PATH="$MPI_HOME/lib:$UCX_HOME/lib:$LD_LIBRARY_PATH"
-        PKG_CONFIG_PATH="$MPI_HOME/lib/pkgconfig:$UCX_HOME/lib/pkgconfig"
-
-        CMAKE_PREFIX_PATH="/opt/rocm/lib/cmake"
-
-        UCX_WARN_UNUSED_ENV_VARS="n"
-        HSA_FORCE_FINE_GRAIN_PCIE=1
-    }
-    stages {
-        stage('Synchronize Source Code') {
-            steps {
-                checkout changelog: false, poll: false, scm: [$class: 'GitSCM', branches: [[name: 'FETCH_HEAD']], doGenerateSubmoduleConfigurations: false, extensions: [[$class: 'CloneOption', depth: 0, noTags: false, reference: '', shallow: false]], submoduleCfg: [], userRemoteConfigs: [[name: 'origin', refspec: '${GERRIT_REFSPEC}', url: 'ssh://gerritgit/rsch/ec/shmem']]]
-            }
-        }
-        stage('Env Variables') {
-            steps {
-                sh 'printenv'
-            }
-        }
-        stage('Make Build Directory') {
-            steps {
-                dir("library") {
-                    sh "mkdir -p ${build_dir}"
-                }
-            }
-        }
-        stage('Build Source Code') {
-
-            failFast true
-
-            parallel {
-                stage('RC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/RC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_SINGLE") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI_WF_COAL') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi_wf_coal install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/RC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_MULTI") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        }
-                    }
-                }
-
-                stage('RC_SINGLE_DEBUG') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_SINGLE_DEBUG") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_single_debug install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_SINGLE_DEBUG") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_DEBUG/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/RC_SINGLE_DEBUG") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_DEBUG/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_SINGLE_DEBUG") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE_DEBUG/install'
-                        }
-                    }
-                }
-
-                stage('RC_SINGLE_PROFILE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_SINGLE_PROFILE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_single_profile install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_SINGLE_PROFILE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_PROFILE/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/RC_SINGLE_PROFILE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_PROFILE/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_SINGLE_PROFILE") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE_PROFILE/install'
-                        }
-                    }
-                }
-
-                stage('DC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/DC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_SINGLE") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/DC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_MULTI") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_IPC') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI_IPC") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi_ipc install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI_IPC") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_IPC/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/DC_MULTI_IPC") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_IPC/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_MULTI_IPC") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_IPC/install'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_DEBUG') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI_DEBUG") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi_debug install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI_DEBUG") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_DEBUG/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/DC_MULTI_DEBUG") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_DEBUG/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_MULTI_DEBUG") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_DEBUG/install'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_PROFILE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI_PROFILE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi_profile install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI_PROFILE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_PROFILE/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/DC_MULTI_PROFILE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_PROFILE/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_MULTI_PROFILE") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_PROFILE/install'
-                        }
-                    }
-                }
-            }
-        }
-        stage('Run Tests') {
-            stages {
-                stage('RC_SINGLE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE all ${build_dir}/RC_SINGLE'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE/spts single_thread ${build_dir}/RC_SINGLE'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI_WF_COAL') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI_WF_COAL'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL all ${build_dir}/RC_MULTI_WF_COAL'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/spts multi_thread ${build_dir}/RC_MULTI_WF_COAL'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI all ${build_dir}/RC_MULTI'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RC_MULTI'
-                        }
-                    }
-                }
-
-                stage('RC_SINGLE_DEBUG') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE_DEBUG/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE_DEBUG'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE_DEBUG all ${build_dir}/RC_SINGLE_DEBUG'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE_DEBUG/spts single_thread ${build_dir}/RC_SINGLE_DEBUG'
-                        }
-                    }
-                }
-
-                stage('RC_SINGLE_PROFILE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE_PROFILE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE_PROFILE'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE_PROFILE all ${build_dir}/RC_SINGLE_PROFILE'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE_PROFILE/spts single_thread ${build_dir}/RC_SINGLE_PROFILE'
-                        }
-                    }
-                }
-
-                stage('DC_SINGLE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/DC_SINGLE'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/DC_SINGLE all ${build_dir}/DC_SINGLE'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/DC_SINGLE/spts single_thread ${build_dir}/DC_SINGLE'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI all ${build_dir}/DC_MULTI'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/DC_MULTI/spts multi_thread ${build_dir}/DC_MULTI'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_IPC') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_IPC/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI_IPC'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_IPC all ${build_dir}/DC_MULTI_IPC'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_IPC/spts multi_thread ${build_dir}/DC_MULTI_IPC'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_DEBUG') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_DEBUG/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI_DEBUG'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_DEBUG all ${build_dir}/DC_MULTI_DEBUG'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_DEBUG/spts multi_thread ${build_dir}/DC_MULTI_DEBUG'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_PROFILE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_PROFILE/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI_PROFILE'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_PROFILE all ${build_dir}/DC_MULTI_PROFILE'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_PROFILE/spts multi_thread ${build_dir}/DC_MULTI_PROFILE'
-                        }
-                    }
-                }
-
-                stage('RO_NET_BASIC') {
-                    // RO_NET controlled at runtime, no need for a new build. Use RC_MULTI
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
-                            sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
-                        }
-                        dir("clients/sos_tests") {
-                            sh 'ROCSHMEM_RO=1 ./driver.sh ${build_dir}/RC_MULTI all ${build_dir}/RC_MULTI'
-                        }
-                        dir("internal/clients/spts") {
-                            sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
-                            sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
diff --git a/projects/rocshmem/internal/continuous_integration/long/long_config.xml b/projects/rocshmem/internal/continuous_integration/long/long_config.xml
deleted file mode 100644
index aa49b2d399..0000000000
--- a/projects/rocshmem/internal/continuous_integration/long/long_config.xml
+++ /dev/null
@@ -1,45 +0,0 @@
-<?xml version='1.1' encoding='UTF-8'?>
-<flow-definition plugin="workflow-job@2.40">
-  <actions>
-    <org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobAction plugin="pipeline-model-definition@1.8.4"/>
-    <org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobPropertyTrackerAction plugin="pipeline-model-definition@1.8.4">
-      <jobProperties/>
-      <triggers/>
-      <parameters/>
-      <options/>
-    </org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobPropertyTrackerAction>
-  </actions>
-  <description></description>
-  <keepDependencies>false</keepDependencies>
-  <properties>
-    <org.jenkinsci.plugins.workflow.job.properties.PipelineTriggersJobProperty>
-      <triggers>
-        <hudson.triggers.TimerTrigger>
-          <spec>H 22 * * *</spec>
-        </hudson.triggers.TimerTrigger>
-      </triggers>
-    </org.jenkinsci.plugins.workflow.job.properties.PipelineTriggersJobProperty>
-  </properties>
-  <definition class="org.jenkinsci.plugins.workflow.cps.CpsScmFlowDefinition" plugin="workflow-cps@2.90">
-    <scm class="hudson.plugins.git.GitSCM" plugin="git@4.7.1">
-      <configVersion>2</configVersion>
-      <userRemoteConfigs>
-        <hudson.plugins.git.UserRemoteConfig>
-          <url>ssh://gerritgit/rsch/ec/shmem</url>
-        </hudson.plugins.git.UserRemoteConfig>
-      </userRemoteConfigs>
-      <branches>
-        <hudson.plugins.git.BranchSpec>
-          <name>*/amd-master</name>
-        </hudson.plugins.git.BranchSpec>
-      </branches>
-      <doGenerateSubmoduleConfigurations>false</doGenerateSubmoduleConfigurations>
-      <submoduleCfg class="empty-list"/>
-      <extensions/>
-    </scm>
-    <scriptPath>internal/continuous_integration/nightly/Jenkinsfile</scriptPath>
-    <lightweight>false</lightweight>
-  </definition>
-  <triggers/>
-  <disabled>false</disabled>
-</flow-definition>
\ No newline at end of file
diff --git a/projects/rocshmem/internal/continuous_integration/nightly/Jenkinsfile b/projects/rocshmem/internal/continuous_integration/nightly/Jenkinsfile
deleted file mode 100644
index e4e8e0284a..0000000000
--- a/projects/rocshmem/internal/continuous_integration/nightly/Jenkinsfile
+++ /dev/null
@@ -1,335 +0,0 @@
-pipeline {
-    agent { label 'sv-pdp-5' }
-    environment {
-        HSA_FORCE_FINE_GRAIN_PCIE = 1
-        MPI_HOME="/home/resperf/mpich-4.0.1/install/global"
-        PATH = "$MPI_HOME/bin:$PATH"
-        LD_LIBRARY_PATH = "$MPI_HOME/lib:$LD_LIBRARY_PATH"
-        build_dir = "builds/${BUILD_ID}"
-        CMAKE_PREFIX_PATH = "/opt/rocm/lib/cmake"
-    }
-    stages {
-        stage('Synchronize Source Code') {
-            steps {
-                git branch: 'amd-master', changelog: false, poll: false, url: 'ssh://gerritgit/rsch/ec/shmem'
-            }
-        }
-        stage('Make Build Directory') {
-            steps {
-                dir("library") {
-                    sh "mkdir -p ${build_dir}"
-                }
-            }
-        }
-        stage('Build Source Code') {
-            parallel {
-                stage('RC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_SINGLE") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI_WF_COAL') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi_wf_coal install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_MULTI") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        }
-                    }
-                }
-
-                stage('RC_SINGLE_DEBUG') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_SINGLE_DEBUG") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_single_debug install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_SINGLE_DEBUG") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_DEBUG/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_SINGLE_DEBUG") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE_DEBUG/install'
-                        }
-                    }
-                }
-
-                stage('RC_SINGLE_PROFILE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_SINGLE_PROFILE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_single_profile install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_SINGLE_PROFILE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_PROFILE/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/RC_SINGLE_PROFILE") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE_PROFILE/install'
-                        }
-                    }
-                }
-
-                stage('DC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_SINGLE") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_MULTI") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_IPC') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI_IPC") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi_ipc install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI_IPC") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_IPC/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_MULTI_IPC") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_IPC/install'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_DEBUG') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI_DEBUG") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi_debug install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI_DEBUG") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_DEBUG/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_MULTI_DEBUG") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_DEBUG/install'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_PROFILE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI_PROFILE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi_profile install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI_PROFILE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_PROFILE/install'
-                        }
-                        //===================== SPTS ==========================
-                        dir("internal/clients/spts/${build_dir}/DC_MULTI_PROFILE") {
-                            sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_PROFILE/install'
-                        }
-                    }
-                }
-            }
-        }
-        stage('Run Tests') {
-            stages {
-                stage('RC_SINGLE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE/spts single_thread ${build_dir}/RC_SINGLE'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI_WF_COAL') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI_WF_COAL'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/spts multi_thread ${build_dir}/RC_MULTI_WF_COAL'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RC_MULTI'
-                        }
-                    }
-                }
-
-                stage('RC_SINGLE_DEBUG') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE_DEBUG/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE_DEBUG'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE_DEBUG/spts single_thread ${build_dir}/RC_SINGLE_DEBUG'
-                        }
-                    }
-                }
-
-                stage('RC_SINGLE_PROFILE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE_PROFILE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE_PROFILE'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE_PROFILE/spts single_thread ${build_dir}/RC_SINGLE_PROFILE'
-                        }
-                    }
-                }
-
-                stage('DC_SINGLE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/DC_SINGLE'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/DC_SINGLE/spts single_thread ${build_dir}/DC_SINGLE'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/DC_MULTI/spts multi_thread ${build_dir}/DC_MULTI'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_IPC') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_IPC/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI_IPC'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_IPC/spts multi_thread ${build_dir}/DC_MULTI_IPC'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_DEBUG') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_DEBUG/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI_DEBUG'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_DEBUG/spts multi_thread ${build_dir}/DC_MULTI_DEBUG'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI_PROFILE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_PROFILE/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI_PROFILE'
-                        }
-                        dir("internal/clients/spts") {
-                            sh './driver.sh ${build_dir}/DC_MULTI_PROFILE/spts multi_thread ${build_dir}/DC_MULTI_PROFILE'
-                        }
-                    }
-                }
-
-                stage('RO_NET_BASIC') {
-                    // RO_NET controlled at runtime, no need for a new build. Use RC_MULTI
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
-                            sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
-                        }
-                        dir("internal/clients/spts") {
-                            sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
-                            sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC'
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
diff --git a/projects/rocshmem/internal/continuous_integration/nightly/nightly_config.xml b/projects/rocshmem/internal/continuous_integration/nightly/nightly_config.xml
deleted file mode 100644
index aa49b2d399..0000000000
--- a/projects/rocshmem/internal/continuous_integration/nightly/nightly_config.xml
+++ /dev/null
@@ -1,45 +0,0 @@
-<?xml version='1.1' encoding='UTF-8'?>
-<flow-definition plugin="workflow-job@2.40">
-  <actions>
-    <org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobAction plugin="pipeline-model-definition@1.8.4"/>
-    <org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobPropertyTrackerAction plugin="pipeline-model-definition@1.8.4">
-      <jobProperties/>
-      <triggers/>
-      <parameters/>
-      <options/>
-    </org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobPropertyTrackerAction>
-  </actions>
-  <description></description>
-  <keepDependencies>false</keepDependencies>
-  <properties>
-    <org.jenkinsci.plugins.workflow.job.properties.PipelineTriggersJobProperty>
-      <triggers>
-        <hudson.triggers.TimerTrigger>
-          <spec>H 22 * * *</spec>
-        </hudson.triggers.TimerTrigger>
-      </triggers>
-    </org.jenkinsci.plugins.workflow.job.properties.PipelineTriggersJobProperty>
-  </properties>
-  <definition class="org.jenkinsci.plugins.workflow.cps.CpsScmFlowDefinition" plugin="workflow-cps@2.90">
-    <scm class="hudson.plugins.git.GitSCM" plugin="git@4.7.1">
-      <configVersion>2</configVersion>
-      <userRemoteConfigs>
-        <hudson.plugins.git.UserRemoteConfig>
-          <url>ssh://gerritgit/rsch/ec/shmem</url>
-        </hudson.plugins.git.UserRemoteConfig>
-      </userRemoteConfigs>
-      <branches>
-        <hudson.plugins.git.BranchSpec>
-          <name>*/amd-master</name>
-        </hudson.plugins.git.BranchSpec>
-      </branches>
-      <doGenerateSubmoduleConfigurations>false</doGenerateSubmoduleConfigurations>
-      <submoduleCfg class="empty-list"/>
-      <extensions/>
-    </scm>
-    <scriptPath>internal/continuous_integration/nightly/Jenkinsfile</scriptPath>
-    <lightweight>false</lightweight>
-  </definition>
-  <triggers/>
-  <disabled>false</disabled>
-</flow-definition>
\ No newline at end of file
diff --git a/projects/rocshmem/internal/continuous_integration/short/Jenkinsfile b/projects/rocshmem/internal/continuous_integration/short/Jenkinsfile
deleted file mode 100644
index 56b22d05d5..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/Jenkinsfile
+++ /dev/null
@@ -1,288 +0,0 @@
-pipeline {
-    agent { label 'sv-pdp-7' }
-    environment {
-        build_dir = "builds/change-${GERRIT_CHANGE_NUMBER}-${GERRIT_PATCHSET_NUMBER}"
-
-        MPI_HOME="/home/resperf/mpich/install"
-        UCX_HOME="/home/resperf/ucx/install"
-
-        PATH="$MPI_HOME/bin:$UCX_HOME/bin:$PATH"
-        LD_LIBRARY_PATH="$MPI_HOME/lib:$UCX_HOME/lib:$LD_LIBRARY_PATH"
-        PKG_CONFIG_PATH="$MPI_HOME/lib/pkgconfig:$UCX_HOME/lib/pkgconfig"
-
-        CMAKE_PREFIX_PATH="/opt/rocm/lib/cmake"
-
-        UCX_WARN_UNUSED_ENV_VARS="n"
-        HSA_FORCE_FINE_GRAIN_PCIE=1
-        UCX_TLS="rc"
-        ROCSHMEM_USE_SQ_GPU_MEM=0
-        ROCSHMEM_USE_CQ_GPU_MEM=0
-        ROCSHMEM_NUM_BLOCKS=128
-    }
-    stages {
-        stage('Synchronize Source Code') {
-            steps {
-                checkout changelog: false, poll: false, scm: [$class: 'GitSCM', branches: [[name: 'FETCH_HEAD']], doGenerateSubmoduleConfigurations: false, extensions: [[$class: 'CloneOption', depth: 0, noTags: false, reference: '', shallow: false]], submoduleCfg: [], userRemoteConfigs: [[name: 'origin', refspec: '${GERRIT_REFSPEC}', url: 'ssh://gerritgit/rsch/ec/shmem']]]
-            }
-        }
-        stage('Env Variables') {
-            steps {
-                sh 'printenv'
-            }
-        }
-        stage('Make Build Directory') {
-            steps {
-                dir("library") {
-                    sh "mkdir -p ${build_dir}"
-                }
-            }
-        }
-        stage('Build Source Code') {
-
-            failFast true
-
-            parallel {
-                stage('RC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/RC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        }
-                        //===================== SPTS ==========================
-                        //dir("internal/clients/spts/${build_dir}/RC_SINGLE") {
-                        //    sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        //}
-                    }
-                }
-
-                stage('RC_MULTI_WF_COAL') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi_wf_coal install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        }
-                        //===================== SPTS ==========================
-                        //dir("internal/clients/spts/${build_dir}/RC_MULTI_WF_COAL") {
-                        //    sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        //}
-                    }
-                }
-
-                stage('RC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/RC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        }
-                        //===================== SPTS ==========================
-                        //dir("internal/clients/spts/${build_dir}/RC_MULTI") {
-                        //    sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        //}
-                    }
-                }
-
-                stage('DC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/DC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        }
-                        //===================== SPTS ==========================
-                        //dir("internal/clients/spts/${build_dir}/DC_SINGLE") {
-                        //    sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        //}
-                    }
-                }
-
-                stage('DC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/DC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        }
-                        //===================== SPTS ==========================
-                        //dir("internal/clients/spts/${build_dir}/DC_MULTI") {
-                        //    sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        //}
-                    }
-                }
-
-                stage('RO_NET') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RO_NET") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/ro_net install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RO_NET") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RO_NET/install'
-                        }
-                        dir("clients/sos_tests/${build_dir}/RO_NET") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RO_NET/install'
-                        }
-                        //===================== SPTS ==========================
-                        //dir("internal/clients/spts/${build_dir}/RO_NET") {
-                        //    sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RO_NET/install'
-                        //}
-                    }
-                }
-            }
-        }
-        stage('Run Tests') {
-            stages {
-                stage('RC_SINGLE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE short ${build_dir}/RC_SINGLE'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh './driver.sh ${build_dir}/RC_SINGLE/spts single_thread ${build_dir}/RC_SINGLE'
-                        //}
-                    }
-                }
-
-                stage('RC_MULTI_WF_COAL') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI_WF_COAL'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL short ${build_dir}/RC_MULTI_WF_COAL'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/spts multi_thread ${build_dir}/RC_MULTI_WF_COAL'
-                        //}
-                    }
-                }
-
-                stage('RC_MULTI') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI short ${build_dir}/RC_MULTI'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh './driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RC_MULTI'
-                        //}
-                    }
-                }
-
-                stage('DC_SINGLE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/DC_SINGLE'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/DC_SINGLE short ${build_dir}/DC_SINGLE'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh './driver.sh ${build_dir}/DC_SINGLE/spts single_thread ${build_dir}/DC_SINGLE'
-                        //}
-                    }
-                }
-
-                stage('DC_MULTI') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI'
-                        }
-                        dir("clients/sos_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI short ${build_dir}/DC_MULTI'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh './driver.sh ${build_dir}/DC_MULTI/spts multi_thread ${build_dir}/DC_MULTI'
-                        //}
-                    }
-                }
-
-                stage('RO_NET') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/rocshmem_example_driver ro ${build_dir}/RO_NET'
-                        }
-                        dir("clients/sos_tests") {
-                            sh 'ROCSHMEM_RO=1 ./driver.sh ${build_dir}/RO_NET short ${build_dir}/RO_NET'
-                        }
-                        //dir("internal/clients/spts") {
-                        //    sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/spts multi_thread ${build_dir}/RO_NET'
-                        //}
-                    }
-                }
-            }
-        }
-        stage('Generate Checker Metadata') {
-            steps {
-                dir("library/${build_dir}") {
-                    sh 'git fetch --tags'
-                    sh 'git log --pretty=oneline remotes/origin/amd-master.. > changeset_delta.txt'
-                    sh 'git log --pretty=oneline remotes/origin/amd-master~1..remotes/origin/amd-master >> changeset_delta.txt'
-                }
-            }
-        }
-        stage('Archive Artifacts') {
-            steps {
-                dir("library/${build_dir}") {
-                    archiveArtifacts artifacts: 'changeset_delta.txt'
-                }
-                dir("clients/functional_tests/${build_dir}") {
-                    archiveArtifacts artifacts: 'RC_SINGLE/**/*.log'
-                    archiveArtifacts artifacts: 'RC_MULTI/**/*.log'
-                    archiveArtifacts artifacts: 'DC_SINGLE/**/*.log'
-                    archiveArtifacts artifacts: 'DC_MULTI/**/*.log'
-                    archiveArtifacts artifacts: 'RO_NET/**/*.log'
-                }
-            }
-        }
-    }
-    post {
-        success {
-            build job: 'shmem_perf_check', wait: true
-        }
-    }
-}
diff --git a/projects/rocshmem/internal/continuous_integration/short/__init__.py b/projects/rocshmem/internal/continuous_integration/short/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/projects/rocshmem/internal/continuous_integration/short/absolute_path.py b/projects/rocshmem/internal/continuous_integration/short/absolute_path.py
deleted file mode 100644
index 7d11a7ce44..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/absolute_path.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/tool/pandora64/.package/python-3.8.0/bin/python3
-
-import glob
-import pprint
-
-class PathGlobber():
-    def __init__(self, name, *partial_paths_to_concatenate):
-        self._search_path = ''
-        for partial_path in partial_paths_to_concatenate:
-            self._search_path += partial_path
-        self.dirs = []
-        self._name = name
-
-    def generate(self):
-        self.dirs = glob.glob(self._search_path, recursive=True)
-
-    def dump(self):
-        str_out = self._name
-        str_out += pprint.pformat(self.dirs, width=120)
-        str_out += '\n'
-        return str_out
diff --git a/projects/rocshmem/internal/continuous_integration/short/archive_path.py b/projects/rocshmem/internal/continuous_integration/short/archive_path.py
deleted file mode 100644
index 1df07aa631..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/archive_path.py
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/tool/pandora64/.package/python-3.8.0/bin/python3
-
-import absolute_path
-import glob
-
-class Archive(absolute_path.PathGlobber):
-    def __init__(self, args, name=''):
-        archive_path = args.archive_path
-        super().__init__(name, args.jenkins_path, archive_path,
-                         args.benchmark_path)
-
-    def path_of_build(self, build_id):
-        path = self._search_path.replace('*/archive', build_id + '/archive')
-        path = glob.glob(path)
-        return path[0]
diff --git a/projects/rocshmem/internal/continuous_integration/short/check_perf_delta.py b/projects/rocshmem/internal/continuous_integration/short/check_perf_delta.py
deleted file mode 100755
index be09e8cab4..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/check_perf_delta.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/tool/pandora64/.package/python-3.8.0/bin/python3
-
-import parser
-import dictionary
-import archive_path
-import checker
-
-def main():
-    # This script accepts command line values, but has reasonable defaults
-    # needed to run as part of the CI infrastructure.
-    p = parser.Parser()
-    args = p.parse_command_line()
-
-    # Jenkins is configured to archive build artifacts in a directory.
-    # The 'archives' variable holds the set of directories for
-    # successful Jenkins builds (those which run to completion).
-    # Partitioning of successful builds is useful since we can ignore
-    # failed build directories while searching for performance data.
-    archives = archive_path.Archive(args)
-    archives.generate()
-    print(archives.dump())
-
-    # Jenkins records changeset information in a changeset_delta.txt file.
-    # We parse the changelog for the commit hash and save it into
-    # 'builds_to_changesets'.
-    build_to_changeset = dictionary.BuildToChangesetDict()
-    build_to_changeset.generate(archives.dirs)
-    print(build_to_changeset.dump())
-
-    # 'changeset_to_build' holds the changeset mappings with a
-    # list of build numbers that match the changeset value.
-    # Builds may be executed many times with the same changeset.
-    # The most recent build (identified by the largest build number) will
-    # be used to retrieve performance data.
-    changeset_to_build = dictionary.ChangesetToBuildDict()
-    changeset_to_build.generate(build_to_changeset)
-    print(changeset_to_build.dump())
-
-    # Jenkins is configured to dump Gerrit-esque relation chain changesets
-    # to an archived output file 'changeset-delta.txt'.
-    # The relation chain will be used to determine changeset performance
-    # data for each changeset in the relation chain (when possible).
-    build_to_relation_chain = dictionary.BuildToRelationChainDict()
-    build_to_relation_chain.generate(archives.dirs)
-    print(build_to_relation_chain.dump())
-
-    perf_checker = checker.Performance(args,
-                                       archives,
-                                       changeset_to_build,
-                                       build_to_relation_chain)
-    perf_checker.run()
-
-if __name__ == '__main__':
-    main()
diff --git a/projects/rocshmem/internal/continuous_integration/short/checker.py b/projects/rocshmem/internal/continuous_integration/short/checker.py
deleted file mode 100644
index 3305115a56..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/checker.py
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/tool/pandora64/.package/python-3.8.0/bin/python3
-
-import archive_path
-import log
-import dictionary
-import report
-import violation
-
-class Performance():
-    def __init__(self, args, archives, changeset_to_build,
-                 build_to_relation_chain):
-        self._args = args
-        self._archives = archives
-        self._changeset_to_build = changeset_to_build
-        self._build_to_relation_chain = build_to_relation_chain
-        self._build_id = build_to_relation_chain.most_recent_build()
-        self._archive_path = archives.path_of_build(self._build_id)
-        self._output = report.Report(self._build_id,
-                                     self._archive_path,
-                                     'performance_diff.txt')
-
-    def _other_build_id(self, other_changeset):
-        packed_id = [build_id for chng,
-                     build_id in self._changeset_to_build.data.items()
-                         if chng.startswith(other_changeset)]
-
-        # The 'packed_id' variable is a list containing lists.
-        # We need the content inside the packed_id data structure.
-        try:
-            build_id = packed_id[0][0]
-            return True, build_id
-        except IndexError:
-            # An index error can occur if builds in the relation chain
-            # have not been tested before attempting to test this
-            # changeset.
-            return False, 0
-
-    def _log_difference(self, log_filename, other_changeset,
-                        other_archive_path, violations):
-        print('determining difference of log file ' + log_filename)
-        self._output.record(log_filename)
-
-        current_file_path = self._archive_path + '/' + log_filename
-        other_file_path = other_archive_path + '/' + log_filename
-        log_pair = log.Pair(current_file_path, other_file_path)
-        log_pair.calculate_differences()
-
-        latency_perc = [float(i.strip('%')) \
-                for i in log_pair.latency_percentage_differences]
-        max_latency = max(latency_perc)
-        violations.check(max_latency, other_changeset, log_filename)
-
-        self._output.record(log_pair.dump())
-
-    def _changeset_difference(self, current_changeset, other_changeset):
-        violations = violation.Threshold(self._args.latency_max, 'latency')
-
-        change_pair = '(' + current_changeset + ',' + other_changeset + ')'
-        print('comparing changesets ' + change_pair)
-        self._output.record(change_pair)
-
-        status, other_build_id = self._other_build_id(other_changeset)
-        if status == False:
-            message = 'skipping changeset ' + other_changeset
-            print(message)
-            self._output.record(message)
-            return violations
-
-        other_archive_path = self._archives.path_of_build(other_build_id)
-        print(self._archive_path)
-        print(other_archive_path)
-
-        for filename in self._args.logs:
-            self._log_difference(filename, other_changeset,
-                                 other_archive_path, violations)
-        print('\n')
-
-        return violations
-
-    def _calculate_performance_differences(self):
-        current_changeset = \
-            self._build_to_relation_chain.data[self._build_id][0]
-        other_changesets = \
-            self._build_to_relation_chain.data[self._build_id][1:]
-
-        for other_changeset in other_changesets:
-            violations = self._changeset_difference(current_changeset,
-                                                    other_changeset)
-
-        # Only report on the last pairwise changeset combination.
-        # This combination represents the changeset being tested and
-        # the amd-master:HEAD.
-        violations.provide_violations_to_report(self._output)
-
-    def run(self):
-        self._output.open()
-        self._calculate_performance_differences()
diff --git a/projects/rocshmem/internal/continuous_integration/short/dictionary.py b/projects/rocshmem/internal/continuous_integration/short/dictionary.py
deleted file mode 100644
index f7861320b2..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/dictionary.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/tool/pandora64/.package/python-3.8.0/bin/python3
-
-import abc
-import os
-import pprint
-import subprocess
-import sys
-
-class BaseDict(metaclass=abc.ABCMeta):
-    def __init__(self):
-        self.data = {}
-        self._delimiter_path = 'archive'
-        self._changeset_delta_filename = 'changeset_delta.txt'
-
-    def _build_id(self, build_directory):
-        sub_directory_strings = build_directory.split('/')
-        word_count = 0
-        for word in sub_directory_strings:
-            if word == '':
-                continue
-            if word == self._delimiter_path:
-                break
-            word_count += 1
-        bld_id = sub_directory_strings[word_count]
-        return bld_id
-
-    def _open_changeset_delta_file(self, archive_directory):
-        build_directory, config_directory = os.path.split(archive_directory)
-        changeset_file_path =  build_directory + '/' + \
-                               self._changeset_delta_filename
-        try:
-            file_handle = open(changeset_file_path, 'r')
-        except:
-            sys.exit('failed to open: ' + changeset_file_path)
-        return file_handle
-
-    @abc.abstractmethod
-    def _changeset_delta_operations(self, file_handle, bld_id):
-        pass
-
-    def generate(self, archives):
-        for d in archives:
-            bld_id = self._build_id(d)
-            f = self._open_changeset_delta_file(d)
-            self._changeset_delta_operations(f, bld_id)
-
-    def most_recent_build(self):
-        build_id_strings = self.data.keys()
-        build_id_ints = list(map(int, build_id_strings))
-        most_recent_build_id_int = max(build_id_ints)
-        return str(most_recent_build_id_int)
-
-    def dump(self):
-        str_out = self._print_text
-        str_out += pprint.pformat(self.data, width=120)
-        str_out += '\n'
-        return str_out
-
-class BuildToChangesetDict(BaseDict):
-    def __init__(self, name=''):
-        super().__init__()
-        self._print_text = name
-
-    def _changeset_delta_operations(self, file_handle, bld_id):
-        commit_line = file_handle.readline()
-        try:
-            commit_hash = commit_line.split()[0]
-        except IndexError:
-            commit_hash = None
-        if commit_hash != None:
-            self.data[bld_id] = commit_hash
-
-class BuildToRelationChainDict(BaseDict):
-    def __init__(self, name=''):
-        super().__init__()
-        self._print_text = name
-
-    def _changeset_delta_operations(self, file_handle, bld_id):
-        changes = []
-        for line in file_handle:
-            changes.append(line.split()[0])
-        self.data[bld_id] = changes
-
-class ChangesetToBuildDict():
-    def __init__(self, name=''):
-        self.data = {}
-        self._print_text = name
-
-    def _invert_dict(self, dictionary):
-        dict_with_duplicates = {}
-        for key, value in dictionary.data.items():
-            list_with_duplicates = dict_with_duplicates.get(value, [])
-            list_with_duplicates.append(key)
-            dict_with_duplicates[value] = list_with_duplicates
-        return dict_with_duplicates
-
-    def generate(self, dictionary):
-        self.data = self._invert_dict(dictionary)
-
-    def dump(self):
-        str_out = self._print_text
-        str_out += pprint.pformat(self.data, width=120)
-        str_out += '\n'
-        return str_out
-
-class ChangelogToMostRecentBuild():
-    def __init__(self, name=''):
-        self._print_text = name
-        self._all_changesets = []
-        self._changesets_with_builds = []
-        self._changesets_without_builds = []
-        self.data = {}
-
-    def _build_id(self, changeset_to_build, changeset):
-        try:
-            build_id_strings = changeset_to_build.data[changeset]
-            build_id_ints = list(map(int, build_id_strings))
-            most_recent_build_id_int = max(build_id_ints)
-            build_id_str = str(most_recent_build_id_int)
-        except:
-            build_id_str = ''
-        return build_id_str
-
-    def _changelog(self):
-        # print git hash along with file modification stats
-        shellcmd =  'git log --pretty=tformat:"%H" --shortstat | '
-        # condense the output down to single line
-        shellcmd += "awk 'ORS=NR%3?\" \":\"\\n\"' | "
-        # parse out the git hash by itself
-        shellcmd += "awk '{print $1}'"
-        x = subprocess.getoutput(shellcmd)
-        self._all_changesets = x.split()
-
-    def _with_builds(self, changeset_to_build):
-        changesets =  list(changeset_to_build.data.keys())
-        self._changesets_with_builds = changesets
-
-    def _without_builds(self):
-        self._changesets_without_builds = \
-            list(set(self._all_changesets) - \
-                 set(self._changesets_with_builds))
-
-    def generate(self, changeset_to_build):
-        self._changelog()
-        self._with_builds(changeset_to_build)
-        self._without_builds()
-        for changeset in self._all_changesets:
-            if changeset in self._changesets_with_builds:
-                build = self._build_id(changeset_to_build, changeset)
-                self.data[changeset] = build
-
-    def dump(self):
-        str_out = self._print_text
-        str_out += 'git-log_changesets_in_order:\n'
-        str_out += pprint.pformat(self._all_changesets, width=120)
-        str_out += '\nfilesystem_with_builds:\n'
-        str_out += pprint.pformat(self._changesets_with_builds, width=120)
-        str_out += '\nfilesystem_without_builds:\n'
-        str_out += pprint.pformat(self._changesets_without_builds, width=120)
-        str_out += '\ngit-log_changesets_to_build-id_mappings:\n'
-        str_out += pprint.pformat(self.data, width=120)
-        str_out += '\n'
-        return str_out
diff --git a/projects/rocshmem/internal/continuous_integration/short/log.py b/projects/rocshmem/internal/continuous_integration/short/log.py
deleted file mode 100644
index df63301f84..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/log.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/tool/pandora64/.package/python-3.8.0/bin/python3
-
-import pprint
-import re
-import sys
-
-class Log():
-    def __init__(self, logfile_abspath):
-        self._file_path = logfile_abspath
-        self.latency = []
-        self.bandwidth = []
-        # regex matches the latency and bandwidth lines in the log files
-        self._regex = '.*[0-9]+\.[0-9]+.*[0-9]\.[0-9].*'
-
-    def open(self):
-        try:
-            self._file_handle = open(self._file_path, 'r')
-        except:
-            sys.exit('failed to open: ' + self._file_path)
-
-    def parse(self):
-        for line in self._file_handle:
-            if re.match(self._regex, line):
-                entries = line.split()
-                self.latency.append(round(float(entries[0]), 4))
-                self.bandwidth.append(round(float(entries[1]), 4))
-
-class Pair():
-    def __init__(self, first_logfile_abspath, second_logfile_abspath):
-        self.first = Log(first_logfile_abspath)
-        self.first.open()
-        self.first.parse()
-        self.second = Log(second_logfile_abspath)
-        self.second.open()
-        self.second.parse()
-
-    def _ratio(self, a, b):
-        diff = [round((x - y), 4) for x, y in zip(a, b)]
-        ratio = []
-        for numerator, denominator in zip(diff, a):
-            try:
-                ratio.append(round(numerator / denominator, 4))
-            except:
-                ratio.append(float(0.0000))
-        return ratio
-
-    def _percent(self, ratio):
-        perc = ['{0:.2%}'.format(x) for x in ratio]
-        return perc
-
-    def _percentage_difference(self, a, b):
-        ratio = self._ratio(a, b)
-        percent = self._percent(ratio)
-        return percent
-
-    def calculate_differences(self):
-        self.latency_percentage_differences = \
-            self._percentage_difference(self.first.latency,
-                                        self.second.latency)
-        self.bandwidth_percentage_differences = \
-            self._percentage_difference(self.first.bandwidth,
-                                        self.second.bandwidth)
-
-    def dump(self):
-        delim = ', '
-        output =  '\tlatency:'
-        output += '\n\t\t'
-        output += delim.join(map(str, self.first.latency))
-        output += '\n\t\t'
-        output += delim.join(map(str, self.second.latency))
-        output += '\n\t\t'
-        output += delim.join(map(str, self.latency_percentage_differences))
-        output += '\n\tbandwidth:'
-        output += '\n\t\t'
-        output += delim.join(map(str, self.first.bandwidth))
-        output += '\n\t\t'
-        output += delim.join(map(str, self.second.bandwidth))
-        output += '\n\t\t'
-        output += delim.join(map(str, self.bandwidth_percentage_differences))
-        return output
-
-class Tracker():
-    def __init__(self, args, archives):
-        self._args = args
-        self._archives = archives
-        self._data = {}
-
-    def add(self, changeset, most_recent_build_id):
-        archive_path = self._archives.path_of_build(most_recent_build_id)
-        for filename in self._args.logs:
-            abs_file_path = archive_path + '/' + filename
-            log = Log(abs_file_path)
-            log.open()
-            log.parse()
-            key = (changeset, filename)
-            self._data[key] = log
-
-    def dump(self):
-        out_str = ''
-        for key in self._data.keys():
-            log = self._data[key]
-            line_str = pprint.pformat(key, width=120)
-            line_str += ' = '
-            line_str += pprint.pformat(log.latency, width=120)
-            line_str += '\n'
-            out_str += line_str
-        return out_str
diff --git a/projects/rocshmem/internal/continuous_integration/short/parser.py b/projects/rocshmem/internal/continuous_integration/short/parser.py
deleted file mode 100644
index 2a122e2ac6..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/parser.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/tool/pandora64/.package/python-3.8.0/bin/python3
-
-import argparse
-
-class Parser():
-    def __init__(self):
-        # A parent directory containing log file output from one of the
-        # configuration runs. The output directories are intended to
-        # be symmetric in naming with the various configurations supplied
-        # by the library's build_configs.
-        self._default_config = 'RC_SINGLE'
-
-        # The list of log files which need to be checked for performance
-        # differences.
-        self._default_logs = ['get.log',
-                              'get_nbi.log',
-                              'get_swarm.log',
-                              'put.log',
-                              'put_nbi.log']
-
-        # The maximum pairwise difference for the log file latencies.
-        self._default_latency_max = 5.0
-
-        # The minimum bandwidth difference for the log file bandwidths.
-        self._default_bandwidth_min = -50.0
-
-        # The Jenkins tester archives slave output on the master's
-        # filesystem which currently uses this top-level path (as the
-        # resperf account).
-        self._default_jenkins_path = \
-                '/proj/radl_extra/users/resperf/jenkins-2.192/'
-
-        # The performance tester runs as part of the 'short' job to
-        # verify that no performance degradation has occurred between
-        # commits. This archive path is the generic archive path
-        # for all of the builds. The Kleene star is used as a place
-        # holder for the Jenkins build number.
-        self._default_archive_path = \
-                'jobs/shmem_short/builds/*/archive/'
-
-        # The default benchmark path can be used to alter archive
-        # output placement. Currently, this is initialized to an empty
-        # string, but subsequently initialized to inject the config
-        # path.
-        self._default_benchmark_path = ''
-
-    def setup_options(self, argparser):
-        argparser.add_argument('-j',
-                               dest='jenkins_path',
-                               default=self._default_jenkins_path)
-        argparser.add_argument('-a',
-                               dest='archive_path',
-                               default=self._default_archive_path)
-        argparser.add_argument('-b',
-                               dest='benchmark_path',
-                               default=self._default_benchmark_path)
-        argparser.add_argument('-c',
-                               dest='config',
-                               default=self._default_config)
-        argparser.add_argument('-l',
-                               dest='logs',
-                               nargs='*',
-                               default=self._default_logs)
-        argparser.add_argument('-x',
-                               dest='latency_max',
-                               type=float,
-                               default=self._default_latency_max)
-        argparser.add_argument('-y',
-                               dest='bandwidth_min',
-                               type=float,
-                               default=self._default_bandwidth_min)
-        argparser.add_argument('-o',
-                               dest='one_changeset')
-        argparser.add_argument('-r',
-                               dest='changeset_range',
-                               nargs=2,
-                               metavar=("most_recent_changeset", "least_recent_changeset"))
-        return argparser
-
-    def parse_command_line(self):
-        p = argparse.ArgumentParser()
-        p = self.setup_options(p)
-        args = p.parse_args()
-        args.benchmark_path = args.config + args.benchmark_path
-        return args
diff --git a/projects/rocshmem/internal/continuous_integration/short/perf_config.xml b/projects/rocshmem/internal/continuous_integration/short/perf_config.xml
deleted file mode 100644
index 6b87cbe037..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/perf_config.xml
+++ /dev/null
@@ -1,30 +0,0 @@
-<?xml version='1.1' encoding='UTF-8'?>
-<project>
-  <actions/>
-  <description>shmem performance delta checker</description>
-  <keepDependencies>false</keepDependencies>
-  <properties/>
-  <scm class="hudson.scm.NullSCM"/>
-  <assignedNode>master</assignedNode>
-  <canRoam>false</canRoam>
-  <disabled>false</disabled>
-  <blockBuildWhenDownstreamBuilding>false</blockBuildWhenDownstreamBuilding>
-  <blockBuildWhenUpstreamBuilding>false</blockBuildWhenUpstreamBuilding>
-  <triggers/>
-  <concurrentBuild>false</concurrentBuild>
-  <builders>
-    <hudson.tasks.Shell>
-      <command>/proj/radl_extra/users/resperf/jenkins-2.192/workspace/shmem_short@script/internal/continuous_integration/short/check_perf_delta.py -c &quot;RC_SINGLE&quot;    -x &quot;15.0&quot; -l put.log put_nbi.log get.log get_nbi.log amo_add.log amo_fadd.log amo_fcswap.log amo_fetch.log amo_finc.log amo_inc.log ping_pong.log
-/proj/radl_extra/users/resperf/jenkins-2.192/workspace/shmem_short@script/internal/continuous_integration/short/check_perf_delta.py -c &quot;RC_MULTI&quot;     -x &quot;15.0&quot; -l put.log put_nbi.log get.log get_nbi.log amo_add.log amo_fadd.log amo_fcswap.log amo_fetch.log amo_finc.log amo_inc.log ping_pong.log get_swarm.log
-/proj/radl_extra/users/resperf/jenkins-2.192/workspace/shmem_short@script/internal/continuous_integration/short/check_perf_delta.py -c &quot;DC_SINGLE&quot;    -x &quot;15.0&quot; -l put.log put_nbi.log get.log get_nbi.log ping_pong.log
-/proj/radl_extra/users/resperf/jenkins-2.192/workspace/shmem_short@script/internal/continuous_integration/short/check_perf_delta.py -c &quot;DC_MULTI&quot;     -x &quot;15.0&quot; -l put.log put_nbi.log get.log get_nbi.log ping_pong.log get_swarm.log
-/proj/radl_extra/users/resperf/jenkins-2.192/workspace/shmem_short@script/internal/continuous_integration/short/check_perf_delta.py -c &quot;RO_NET_BASIC&quot; -x &quot;75.0&quot; -l put.log put_nbi.log get.log get_nbi.log ping_pong.log
-</command>
-      <configuredLocalRules/>
-    </hudson.tasks.Shell>
-  </builders>
-  <publishers/>
-  <buildWrappers>
-    <hudson.plugins.timestamper.TimestamperBuildWrapper plugin="timestamper@1.12"/>
-  </buildWrappers>
-</project>
diff --git a/projects/rocshmem/internal/continuous_integration/short/plot.py b/projects/rocshmem/internal/continuous_integration/short/plot.py
deleted file mode 100755
index 825d4082f7..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/plot.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/tool/pandora64/.package/python-3.8.0/bin/python3
-
-import parser
-import dictionary
-import archive_path
-import plotter
-
-def main():
-    p = parser.Parser()
-    args = p.parse_command_line()
-
-    archives = archive_path.Archive(args)
-    archives.generate()
-    print(archives.dump())
-
-    build_to_changeset = dictionary.BuildToChangesetDict()
-    build_to_changeset.generate(archives.dirs)
-    print(build_to_changeset.dump())
-
-    changeset_to_build = dictionary.ChangesetToBuildDict()
-    changeset_to_build.generate(build_to_changeset)
-    print(changeset_to_build.dump())
-
-    plot = plotter.Plot(args,
-                        archives,
-                        changeset_to_build)
-
-    # either plot with all the changesets or the slice provided
-    plot.changeset_slice()
-    
-    if (args.one_changeset):
-        plot.one_changeset_plot()
-
-if __name__ == '__main__':
-    main()
diff --git a/projects/rocshmem/internal/continuous_integration/short/plotter.R b/projects/rocshmem/internal/continuous_integration/short/plotter.R
deleted file mode 100755
index 5171e29069..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/plotter.R
+++ /dev/null
@@ -1,230 +0,0 @@
-#!/usr/bin/env Rscript
-
-# load the required libraries:
-library(tidyverse)
-library(RColorBrewer)
-library(optparse)
-
-# declare some helper functions
-ggpreview <- function (..., device = "png") {
-  fname <- tempfile(fileext = paste0(".", device))
-  ggplot2::ggsave(filename = fname, device = device, ...)
-  system2("open", fname)
-  invisible(NULL)
-}
-
-set_right_order <- function(df) {
-  # reverse the order of the rows so that oldest commit is first
-  df <- df %>% map_df(rev)
-  # ensure that ggplot plots the x-axis in the right order
-  df$Commit <- factor(df$Commit, levels = unique(df$Commit))
-  return(df)
-}
-
-plot_and_save <- function(df, xval, yval, title, subtitle, xlabel, filename) {
-  p <- ggplot(df, aes_string(x=xval, y=yval, group=1)) +
-    geom_line(size = 0.5, color=mycolors[1]) +
-    geom_point(size = 1.5, alpha = 1, color=mycolors[2]) +
-    theme_minimal() +
-    expand_limits(y=0) +
-    xlab(xlabel) +
-    ggtitle(title, subtitle = subtitle) +
-    theme(
-      axis.text.x = element_text(angle=90,hjust=1),
-      axis.title.y = element_blank()
-    ) +
-  scale_fill_manual(values = mycolors)
-  #ggpreview(width=7.5, height=5, units="in", dpi=500)
-  ggsave(filename, p, device=pdf, dpi=500)
-}
-
-## Set up options ##
-
-option_list = list(
-    make_option(c("-o", "--output"), type="character", default=NULL, action="store",
-                   help="path (without trailing /) to a folder that will
-                   contain the plots", metavar="folder-path"),
-    make_option(c("-a", "--changeset_a"), type="character", default=NULL, action="store",
-                   help="beginning (inclusive) changeset of slice", metavar="changeset"),
-    make_option(c("-b", "--changeset_b"), type="character", default=NULL, action="store",
-                   help="ending (inclusive) changeset of slice", metavar="changeset"),
-    make_option(c("-c", "--one_changeset"), type="character", default=NULL, action="store",
-                help="if set, will prepare plots for one changeset; if not, plots for a changeset slice")
-
-)
-
-## SCRIPT START ##
-
-# parse the options
-opt_parser <- OptionParser(option_list=option_list)
-opts <- parse_args(opt_parser)
-if (is.null(opts$output)) {
-    print_help(opt_parser)
-    stop("Please set the --output flag.", call.=FALSE)
-}
-slice_opt = 0
-single_opt = 0
-if (!is.null(opts$changeset_a) && !is.null(opts$changeset_b)) {
-    slice_opt = 1
-}
-if (!is.null(opts$one_changeset)) {
-    single_opt = 1
-}
-
-if ( (slice_opt && single_opt) || (!slice_opt && !single_opt) ) {
-    stop("Please supply a slice or a single changeset, not both.", call.= FALSE)
-}
-
-# choose color palette
-mycolors <- brewer.pal(5, "Set2")
-
-if (length(opts$one_changeset) > 0) {
-  ## Plotting data for a single changeset ##
-
-  # read the files
-  non_amo   <- read.csv("non_amo_one_changeset.csv", header=TRUE)
-  amo       <- read.csv("amo_one_changeset.csv", header=TRUE)
-  ping_pong <- read.csv("ping_pong_one_changeset.csv", header=TRUE)
-
-  # ensure that ggplot plots the x-axis in the right order
-  non_amo$size <- factor(non_amo$size, levels = unique(non_amo$size))
-  amo$op <- factor(amo$op, levels = unique(amo$op))
-
-  # plot
-  non_amo_ops <- list("put","put_nbi","get","get_nbi")
-  for (op in non_amo_ops) {
-    plot_and_save(df=non_amo,
-                  xval="size",
-                  yval=op,
-                  title=op,
-                  subtitle="Latency (us)",
-                  xlabel="Message size (bytes)",
-                  filename=paste(opts$output,"/",op,"_changeset_",opts$one_changeset,".pdf", sep="")
-                  )
-  }
-
-  # prepare data for plots with fixed message size and ops as x axis
-  non_amo$bsize <- paste("b",non_amo$size,sep="") # (so that the columns in non_amo_t start with a character)
-  non_amo_t <- setNames(data.frame(t(non_amo[,2:5])), non_amo[,6]) # transpose + set column names
-  non_amo_t$op <- colnames(non_amo[,2:5]) # make a column with operation names
-
-  sizes <- colnames(non_amo_t[,-(length(colnames(non_amo_t)))])
-  for (size in sizes) {
-    plot_and_save(df=non_amo_t,
-                  xval="op",
-                  yval=size,
-                  title=paste(sub('.', '', size),"byte"),
-                  subtitle="Latency (us)",
-                  xlabel="Operation",
-                  filename=paste(opts$output,"/",size,"_changeset_",opts$one_changeset,".pdf", sep="")
-    )
-  }
-
-  plot_and_save(df=amo,
-                xval="op",
-                yval="latency",
-                title="Atomics",
-                subtitle="Latency (us)",
-                xlabel="Operation",
-                filename=paste(opts$output,"/atomic_changeset_",opts$one_changeset,".pdf", sep="")
-  )
-
-  ping_pong$type <- c("ping_pong")
-  p<-ggplot(ping_pong, aes(x=type, y=latency, fill=type)) +
-    geom_bar(stat="identity", width=0.5) +
-    theme_minimal() +
-    ggtitle("Ping pong", subtitle = "Latency (us)") +
-    theme(
-      axis.title.y = element_blank(),
-      axis.text.y = element_blank(),
-      axis.title.x = element_blank(),
-      legend.position = "none"
-      ) +
-    coord_flip() +
-    scale_fill_manual(values = mycolors)
-  #ggpreview(width=7.5, height=5, units="in", dpi=500)
-  ggsave(paste(opts$output,"/ping_pong_changeset_",opts$one_changeset,".pdf", sep=""), p, device=pdf, dpi=500)
-
-} else {
-  ## Plotting across a changeset slice ##
-
-  # read the files
-  put             <- read.csv("put.csv", header=TRUE)
-  put_nbi         <- read.csv("put_nbi.csv", header=TRUE)
-  get             <- read.csv("get.csv", header=TRUE)
-  get_nbi         <- read.csv("get_nbi.csv", header=TRUE)
-  amo             <- read.csv("amo.csv", header=TRUE)
-  ping_pong       <- read.csv("ping_pong.csv", header=TRUE)
-
-  # slice out the commits
-  start   <- match(c(opts$changeset_a), put$Commit)
-  end     <- match(c(opts$changeset_b), put$Commit)
-  # (start and end should be the same for all the frames) #
-  put             <- put[start:end,]
-  put_nbi         <- put_nbi[start:end,]
-  get             <- get[start:end,]
-  get_nbi         <- get_nbi[start:end,]
-  amo             <- amo[start:end,]
-  ping_pong       <- ping_pong[start:end,]
-
-  put             <- set_right_order(put)
-  put_nbi         <- set_right_order(put_nbi)
-  get             <- set_right_order(get)
-  get_nbi         <- set_right_order(get_nbi)
-  amo             <- set_right_order(amo)
-  ping_pong       <- set_right_order(ping_pong)
-
-  # plot
-  non_amo_ops <- list("put","put_nbi","get","get_nbi")
-  sizes_to_subtitle_map <- list("b1"="1 byte",
-                                "b2"="2 bytes",
-                                "b4"="4 bytes",
-                                "b8"="8 bytes",
-                                "b16"="16 bytes",
-                                "b32"="32 bytes",
-                                "b64"="64 bytes",
-                                "b128"="128 bytes",
-                                "b256"="256 bytes",
-                                "b512"="512 bytes",
-                                "b1024"="1024 bytes",
-                                "b2048"="2048 bytes",
-                                "b4096"="4096 bytes",
-                                "b8192"="8192 bytes",
-                                "b16384"="16384 bytes",
-                                "b32768"="32768 bytes")
-  for (op in non_amo_ops) {
-    for (size in names(sizes_to_subtitle_map)) {
-      plot_and_save(df=eval(parse(text=op)),
-                    xval="Commit",
-                    yval=size,
-                    title=op,
-                    subtitle=paste("Latency (us) for ",sizes_to_subtitle_map[[size]],sep=""),
-                    xlabel="Commit (older to newer)",
-                    filename=paste(opts$output,"/",op,"_",size,".pdf", sep="")
-      )
-      }
-  }
-
-  amo_ops <- list("add","cswap","fadd","fcswap","fetch","finc","inc")
-  for (op in amo_ops) {
-    plot_and_save(df=amo,
-                  xval="Commit",
-                  yval=op,
-                  title=op,
-                  subtitle="Latency (us)",
-                  xlabel="Commit (older to newer)",
-                  filename=paste(opts$output,"/",op,".pdf", sep="")
-    )
-  }
-
-  plot_and_save(df=ping_pong,
-                xval="Commit",
-                yval="latency",
-                title="ping_pong",
-                subtitle="Latency (us)",
-                xlabel="Commit (older to newer)",
-                filename=paste(opts$output,"/","ping_pong.pdf", sep="")
-  )
-}
-
-## SCRIPT END ##
diff --git a/projects/rocshmem/internal/continuous_integration/short/plotter.py b/projects/rocshmem/internal/continuous_integration/short/plotter.py
deleted file mode 100644
index 2f86bc3711..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/plotter.py
+++ /dev/null
@@ -1,295 +0,0 @@
-#!/tool/pandora64/.package/python-3.8.0/bin/python3
-
-import dictionary
-import log
-#import matplotlib.pyplot
-import numpy
-import csv
-import os
-import subprocess
-import sys
-
-class Plot():
-    def __init__(self, args, archives, changeset_to_build):
-        self._args = args
-        self._archives = archives
-        self._changelog = dictionary.ChangelogToMostRecentBuild()
-        self._changelog.generate(changeset_to_build)
-        print(self._changelog.dump())
-
-    def abbreviate_changesets(self, changesets):
-        return [changeset[0:8] for changeset in changesets]
-
-    @staticmethod
-    def write_dict_to_file(tracker, field_names, file_name):
-        with open(file_name, 'w') as csvfile:
-            writer = csv.DictWriter(csvfile, fieldnames=field_names)
-            writer.writeheader()
-            writer.writerows(tracker)
-    
-    @staticmethod
-    def check_and_add_to_dict(dictionary, key, array):
-        if len(array) > 0:
-            dictionary[key] = array[0]
-        else:
-            dictionary[key] = 0
-
-    def changeset_slice(self):
-        self._log_tracker = log.Tracker(self._args, self._archives)
-        for changeset in self._changelog._all_changesets:
-            if changeset in self._changelog.data.keys():
-                build_id = self._changelog.data[changeset]
-                self._log_tracker.add(changeset, build_id)
-        print(self._log_tracker.dump())
-        
-        """
-        separate out dictionaries based on operation
-        and prepare them in a format that works with
-        the csv module
-        """
-        put_tracker = []
-        put_nbi_tracker = []
-        get_tracker = []
-        get_nbi_tracker = []
-        amo_tracker = []
-        ping_pong_tracker = []
-        prev_commit = list(self._log_tracker._data.keys())[0][0]
-        amo_dict = {}
-        for key, value in self._log_tracker._data.items():
-            if (key[1] == "put.log"):
-                put_tracker.append({'Commit':key[0][0:7],
-                                    'b1':value.latency[0],
-                                    'b2':value.latency[1],
-                                    'b4':value.latency[2],
-                                    'b8':value.latency[3],
-                                    'b16':value.latency[4],
-                                    'b32':value.latency[5],
-                                    'b64':value.latency[6],
-                                    'b128':value.latency[7],
-                                    'b256':value.latency[8],
-                                    'b512':value.latency[9],
-                                    'b1024':value.latency[10],
-                                    'b2048':value.latency[11],
-                                    'b4096':value.latency[12],
-                                    'b8192':value.latency[13],
-                                    'b16384':value.latency[14],
-                                    'b32768':value.latency[15]
-                                    })
-            if (key[1] == "put_nbi.log"):
-                put_nbi_tracker.append({'Commit':key[0][0:7],
-                                    'b1':value.latency[0],
-                                    'b2':value.latency[1],
-                                    'b4':value.latency[2],
-                                    'b8':value.latency[3],
-                                    'b16':value.latency[4],
-                                    'b32':value.latency[5],
-                                    'b64':value.latency[6],
-                                    'b128':value.latency[7],
-                                    'b256':value.latency[8],
-                                    'b512':value.latency[9],
-                                    'b1024':value.latency[10],
-                                    'b2048':value.latency[11],
-                                    'b4096':value.latency[12],
-                                    'b8192':value.latency[13],
-                                    'b16384':value.latency[14],
-                                    'b32768':value.latency[15]
-                                    })
-            if (key[1] == "get.log"):
-                get_tracker.append({'Commit':key[0][0:7],
-                                    'b1':value.latency[0],
-                                    'b2':value.latency[1],
-                                    'b4':value.latency[2],
-                                    'b8':value.latency[3],
-                                    'b16':value.latency[4],
-                                    'b32':value.latency[5],
-                                    'b64':value.latency[6],
-                                    'b128':value.latency[7],
-                                    'b256':value.latency[8],
-                                    'b512':value.latency[9],
-                                    'b1024':value.latency[10],
-                                    'b2048':value.latency[11],
-                                    'b4096':value.latency[12],
-                                    'b8192':value.latency[13],
-                                    'b16384':value.latency[14],
-                                    'b32768':value.latency[15]
-                                    })
-            if (key[1] == "get_nbi.log"):
-                get_nbi_tracker.append({'Commit':key[0][0:7],
-                                    'b1':value.latency[0],
-                                    'b2':value.latency[1],
-                                    'b4':value.latency[2],
-                                    'b8':value.latency[3],
-                                    'b16':value.latency[4],
-                                    'b32':value.latency[5],
-                                    'b64':value.latency[6],
-                                    'b128':value.latency[7],
-                                    'b256':value.latency[8],
-                                    'b512':value.latency[9],
-                                    'b1024':value.latency[10],
-                                    'b2048':value.latency[11],
-                                    'b4096':value.latency[12],
-                                    'b8192':value.latency[13],
-                                    'b16384':value.latency[14],
-                                    'b32768':value.latency[15]
-                                    })
-            if (key[1] == "ping_pong.log"):
-                ping_pong_tracker.append({'Commit':key[0][0:7],
-                                          'latency':value.latency[0]
-                                         })
-
-            # check to see if we have moved to a new commit
-            # if we have, store the dict in the amo_tracker
-            if (key[0] != prev_commit):
-                amo_dict['Commit'] = prev_commit[0:7]
-                amo_tracker.append(amo_dict.copy())
-                amo_dict.clear()
-            
-            prev_commit = key[0]
-            
-            if (key[1] == "amo_add.log"):
-                self.check_and_add_to_dict(amo_dict, 'add', value.latency)
-            if (key[1] == "amo_cswap.log"):
-                self.check_and_add_to_dict(amo_dict, 'cswap', value.latency)
-            if (key[1] == "amo_fadd.log"):
-                self.check_and_add_to_dict(amo_dict, 'fadd', value.latency)
-            if (key[1] == "amo_fcswap.log"):
-                self.check_and_add_to_dict(amo_dict, 'fcswap', value.latency)
-            if (key[1] == "amo_fetch.log"):
-                self.check_and_add_to_dict(amo_dict, 'fetch', value.latency)
-            if (key[1] == "amo_finc.log"):
-                self.check_and_add_to_dict(amo_dict, 'finc', value.latency)
-            if (key[1] == "amo_inc.log"):
-                self.check_and_add_to_dict(amo_dict, 'inc', value.latency)
-        
-        # store the last commit's amo data
-        amo_dict['Commit'] = prev_commit[0:7]
-        amo_tracker.append(amo_dict.copy())
-
-        # write put results into a file:
-        size_field_names= ['Commit','b1','b2','b4','b8','b16','b32','b64','b128','b256','b512','b1024','b2048','b4096','b8192','b16384','b32768']
-        amo_field_names= ['Commit','add','cswap','fadd','fcswap','fetch','finc','inc']
-        ping_pong_field_names= ['Commit','latency']
-
-        self.write_dict_to_file(put_tracker, size_field_names, "put.csv")
-        self.write_dict_to_file(put_nbi_tracker, size_field_names, "put_nbi.csv")
-        self.write_dict_to_file(get_tracker, size_field_names, "get.csv")
-        self.write_dict_to_file(get_nbi_tracker, size_field_names, "get_nbi.csv")
-        self.write_dict_to_file(amo_tracker, amo_field_names, "amo.csv")
-        self.write_dict_to_file(ping_pong_tracker, ping_pong_field_names, "ping_pong.csv")
-
-        # make a directory and execute the R script to generate plots in that directory
-        current_dir = os.getcwd()
-        plot_dir = os.path.join(current_dir, 'plots')
-        if not os.path.exists(plot_dir):
-            os.makedirs(plot_dir)
-
-        changeset_a = list(self._log_tracker._data.keys())[0][0]
-        changeset_b = list(self._log_tracker._data.keys())[-1][0]
-
-        # check if the provided changesets are correct
-        if (self._args.changeset_range):
-            found_changeset_a = False
-            found_changeset_b = False
-            for key, value in self._log_tracker._data.items():
-                if (found_changeset_a and found_changeset_b):
-                    break
-                if (not found_changeset_a):
-                    if (self._args.changeset_range[0] == key[0]):
-                        found_changeset_a = True
-                if (not found_changeset_b):
-                    if (self._args.changeset_range[1] == key[0]):
-                        found_changeset_b = True
-            
-            if ((not found_changeset_a) and (not found_changeset_b)):
-                sys.exit("One of the specified changesets was not found. Please specify correct/complete commit IDs.")
-            else:
-                changeset_a = self._args.changeset_range[0]
-                changeset_b = self._args.changeset_range[1]
-
-        r_command = "Rscript ./plotter.R -o ./plots -a " + changeset_a[0:7] + " -b " + changeset_b[0:7]
-
-        print(r_command)
-        subprocess.check_call(r_command, shell=True)
-
-
-    def one_changeset_plot(self):
-        found_changeset = 0
-        non_amo_tracker = []
-        amo_tracker = []
-        ping_pong_tracker = []
-        for key, value in self._log_tracker._data.items():
-            if (key[0] == self._args.one_changeset):
-                found_changeset = 1
-                if (key[1] == "put.log"):
-                    put_vals = value.latency
-                if (key[1] == "put_nbi.log"):
-                    put_nbi_vals = value.latency
-                if (key[1] == "get.log"):
-                    get_vals = value.latency
-                if (key[1] == "get_nbi.log"):
-                    get_nbi_vals = value.latency
-                if (key[1] == "amo_add.log"):
-                    amo_tracker.append({'op':'add',
-                                        'latency': value.latency[0] if len(value.latency) > 0 else 0
-                                        })
-                if (key[1] == "amo_add.log"):
-                    amo_tracker.append({'op':'add',
-                                        'latency': value.latency[0] if len(value.latency) > 0 else 0
-                                        })
-                if (key[1] == "amo_cswap.log"):
-                    amo_tracker.append({'op':'cswap',
-                                        'latency': value.latency[0] if len(value.latency) > 0 else 0
-                                        })
-                if (key[1] == "amo_fadd.log"):
-                    amo_tracker.append({'op':'fadd',
-                                        'latency': value.latency[0] if len(value.latency) > 0 else 0
-                                        })
-                if (key[1] == "amo_fcswap.log"):
-                    amo_tracker.append({'op':'fcswap',
-                                        'latency': value.latency[0] if len(value.latency) > 0 else 0
-                                        })
-                if (key[1] == "amo_fetch.log"):
-                    amo_tracker.append({'op':'fetch',
-                                        'latency': value.latency[0] if len(value.latency) > 0 else 0
-                                        })
-                if (key[1] == "amo_finc.log"):
-                    amo_tracker.append({'op':'finc',
-                                        'latency': value.latency[0] if len(value.latency) > 0 else 0
-                                        })
-                if (key[1] == "amo_inc.log"):
-                    amo_tracker.append({'op':'inc',
-                                        'latency': value.latency[0] if len(value.latency) > 0 else 0
-                                        })
-                if (key[1] == "ping_pong.log"):
-                    ping_pong_tracker.append({'latency': value.latency[0] if len(value.latency) > 0 else 0
-                                        })
-
-
-        if (not found_changeset):
-            sys.exit("The requested changeset was not found. Please specify correct/complete commit IDs.")
-        
-        index = 0
-        for size in [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]:
-            non_amo_tracker.append({'size':size,
-                                    'put':put_vals[index],
-                                    'put_nbi':put_nbi_vals[index],
-                                    'get':get_vals[index],
-                                    'get_nbi':get_nbi_vals[index]
-                                    })
-            index = index + 1
-        
-        # write results into a file:
-        non_amo_field_names= ['size','put','put_nbi','get','get_nbi']
-        amo_field_names= ['op','latency']
-        ping_pong_field_names= ['latency']
-
-        self.write_dict_to_file(non_amo_tracker, non_amo_field_names, "non_amo_one_changeset.csv")
-        self.write_dict_to_file(amo_tracker, amo_field_names, "amo_one_changeset.csv")
-        self.write_dict_to_file(ping_pong_tracker, ping_pong_field_names, "ping_pong_one_changeset.csv")
-
-        # call the R script with an option that tells it to plot figures for
-        r_command = "Rscript ./plotter.R -o ./plots -c " + self._args.one_changeset
-
-        print(r_command)
-        subprocess.check_call(r_command, shell=True)
-
diff --git a/projects/rocshmem/internal/continuous_integration/short/report.py b/projects/rocshmem/internal/continuous_integration/short/report.py
deleted file mode 100644
index bca823b490..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/report.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/tool/pandora64/.package/python-3.8.0/bin/python3
-
-import sys
-
-class Report():
-    def __init__(self, identifier, path, filename):
-        self._identifier = identifier
-        self._path = path
-        self._filename = filename
-
-    def open(self):
-        print('opening report for ' + self._identifier)
-        try:
-            report_path = self._path + '/' + self._filename
-            print('report_path: ' + report_path)
-            self._file_handle = open(report_path, 'w')
-        except:
-            sys.exit('failed to open report: ' + report_path)
-
-    def record(self, message):
-        self._file_handle.write(message + '\n')
diff --git a/projects/rocshmem/internal/continuous_integration/short/short_config.xml b/projects/rocshmem/internal/continuous_integration/short/short_config.xml
deleted file mode 100644
index bdb369c62d..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/short_config.xml
+++ /dev/null
@@ -1,96 +0,0 @@
-<?xml version='1.1' encoding='UTF-8'?>
-<flow-definition plugin="workflow-job@2.40">
-  <actions>
-    <org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobAction plugin="pipeline-model-definition@1.8.4"/>
-    <org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobPropertyTrackerAction plugin="pipeline-model-definition@1.8.4">
-      <jobProperties/>
-      <triggers/>
-      <parameters/>
-      <options/>
-    </org.jenkinsci.plugins.pipeline.modeldefinition.actions.DeclarativeJobPropertyTrackerAction>
-  </actions>
-  <description></description>
-  <keepDependencies>false</keepDependencies>
-  <properties>
-    <org.jenkinsci.plugins.workflow.job.properties.DisableConcurrentBuildsJobProperty/>
-    <org.jenkinsci.plugins.workflow.job.properties.PipelineTriggersJobProperty>
-      <triggers>
-        <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.GerritTrigger plugin="gerrit-trigger@2.33.0">
-          <spec></spec>
-          <gerritProjects>
-            <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.GerritProject>
-              <compareType>PLAIN</compareType>
-              <pattern>rsch/ec/shmem</pattern>
-              <branches>
-                <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.Branch>
-                  <compareType>PLAIN</compareType>
-                  <pattern>amd-master</pattern>
-                </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.Branch>
-              </branches>
-              <disableStrictForbiddenFileVerification>false</disableStrictForbiddenFileVerification>
-            </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.GerritProject>
-          </gerritProjects>
-          <dynamicGerritProjects class="empty-list"/>
-          <skipVote>
-            <onSuccessful>false</onSuccessful>
-            <onFailed>false</onFailed>
-            <onUnstable>false</onUnstable>
-            <onNotBuilt>false</onNotBuilt>
-            <onAborted>false</onAborted>
-          </skipVote>
-          <silentMode>false</silentMode>
-          <enableTopicAssociation>false</enableTopicAssociation>
-          <notificationLevel></notificationLevel>
-          <silentStartMode>false</silentStartMode>
-          <escapeQuotes>true</escapeQuotes>
-          <nameAndEmailParameterMode>PLAIN</nameAndEmailParameterMode>
-          <dependencyJobsNames>shmem_perf_check, </dependencyJobsNames>
-          <commitMessageParameterMode>BASE64</commitMessageParameterMode>
-          <changeSubjectParameterMode>PLAIN</changeSubjectParameterMode>
-          <commentTextParameterMode>BASE64</commentTextParameterMode>
-          <buildStartMessage></buildStartMessage>
-          <buildFailureMessage></buildFailureMessage>
-          <buildSuccessfulMessage></buildSuccessfulMessage>
-          <buildUnstableMessage></buildUnstableMessage>
-          <buildNotBuiltMessage></buildNotBuiltMessage>
-          <buildAbortedMessage></buildAbortedMessage>
-          <buildUnsuccessfulFilepath></buildUnsuccessfulFilepath>
-          <customUrl></customUrl>
-          <serverName>amd-gerrit</serverName>
-          <triggerOnEvents>
-            <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.events.PluginCommentAddedContainsEvent>
-              <commentAddedCommentContains>!SHORT</commentAddedCommentContains>
-            </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.events.PluginCommentAddedContainsEvent>
-          </triggerOnEvents>
-          <dynamicTriggerConfiguration>false</dynamicTriggerConfiguration>
-          <triggerConfigURL></triggerConfigURL>
-          <triggerInformationAction/>
-        </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.GerritTrigger>
-      </triggers>
-    </org.jenkinsci.plugins.workflow.job.properties.PipelineTriggersJobProperty>
-  </properties>
-  <definition class="org.jenkinsci.plugins.workflow.cps.CpsScmFlowDefinition" plugin="workflow-cps@2.90">
-    <scm class="hudson.plugins.git.GitSCM" plugin="git@4.7.1">
-      <configVersion>2</configVersion>
-      <userRemoteConfigs>
-        <hudson.plugins.git.UserRemoteConfig>
-          <name>origin</name>
-          <refspec>${GERRIT_REFSPEC}</refspec>
-          <url>ssh://gerritgit/rsch/ec/shmem</url>
-        </hudson.plugins.git.UserRemoteConfig>
-      </userRemoteConfigs>
-      <branches>
-        <hudson.plugins.git.BranchSpec>
-          <name>FETCH_HEAD</name>
-        </hudson.plugins.git.BranchSpec>
-      </branches>
-      <doGenerateSubmoduleConfigurations>false</doGenerateSubmoduleConfigurations>
-      <submoduleCfg class="empty-list"/>
-      <extensions/>
-    </scm>
-    <scriptPath>internal/continuous_integration/short/Jenkinsfile</scriptPath>
-    <lightweight>false</lightweight>
-  </definition>
-  <triggers/>
-  <disabled>false</disabled>
-</flow-definition>
\ No newline at end of file
diff --git a/projects/rocshmem/internal/continuous_integration/short/violation.py b/projects/rocshmem/internal/continuous_integration/short/violation.py
deleted file mode 100644
index 4f7ece2de9..0000000000
--- a/projects/rocshmem/internal/continuous_integration/short/violation.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/tool/pandora64/.package/python-3.8.0/bin/python3
-
-import pprint
-import report
-import sys
-
-class Threshold():
-    def __init__(self, maximum_threshold, violation_type):
-        self._violations = {}
-        self._maximum_threshold = maximum_threshold
-        self._violation_type = violation_type
-
-    def check(self, value, changeset, filename):
-        if value >= self._maximum_threshold:
-            key = changeset + '|' + filename + '|' + self._violation_type
-            self._violations[key] = value
-            print(key + ': ' + str(value) + '%')
-
-    def provide_violations_to_report(self, report):
-        if self.has_violations():
-            report.record('FAILURE')
-            report.record(self.dump())
-            sys.exit(1)
-        else:
-            report.record('SUCCESS')
-            sys.exit(0)
-
-    def has_violations(self):
-        return bool(self._violations)
-
-    def dump(self):
-        str_out = pprint.pformat(self._violations, width=120)
-        str_out += '\n'
-        return str_out
diff --git a/projects/rocshmem/internal/continuous_integration/smoke/Jenkinsfile b/projects/rocshmem/internal/continuous_integration/smoke/Jenkinsfile
deleted file mode 100644
index 592f2516de..0000000000
--- a/projects/rocshmem/internal/continuous_integration/smoke/Jenkinsfile
+++ /dev/null
@@ -1,151 +0,0 @@
-pipeline {
-    agent { label 'sv-pdp-5' }
-    environment {
-        HSA_FORCE_FINE_GRAIN_PCIE = 1
-        MPI_HOME="/home/resperf/mpich-4.0.1/install/global"
-        PATH = "$MPI_HOME/bin:$PATH"
-        LD_LIBRARY_PATH = "$MPI_HOME/lib:$LD_LIBRARY_PATH"
-        build_dir = "builds/change-${GERRIT_CHANGE_NUMBER}-${GERRIT_PATCHSET_NUMBER}"
-        CMAKE_PREFIX_PATH = "/opt/rocm/lib/cmake"
-    }
-    stages {
-        stage('Synchronize Source Code') {
-            steps {
-                checkout changelog: false, poll: false, scm: [$class: 'GitSCM', branches: [[name: 'FETCH_HEAD']], doGenerateSubmoduleConfigurations: false, extensions: [[$class: 'CloneOption', depth: 0, noTags: false, reference: '', shallow: false]], submoduleCfg: [], userRemoteConfigs: [[name: 'origin', refspec: '${GERRIT_REFSPEC}', url: 'ssh://gerritgit/rsch/ec/shmem']]]
-            }
-        }
-        stage('Make Build Directory') {
-            steps {
-                dir("library") {
-                    sh "mkdir -p ${build_dir}"
-                }
-            }
-        }
-        stage('Build Source Code') {
-            parallel {
-                stage('RC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI_WF_COAL') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi_wf_coal install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/RC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/rc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/RC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install'
-                        }
-                    }
-                }
-
-                stage('DC_SINGLE') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_SINGLE") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_single install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_SINGLE") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI') {
-                    steps {
-                        //===================== LIBRARY =======================
-                        dir("library/${build_dir}/DC_MULTI") {
-                            sh 'mkdir -p install'
-                            sh '../../../build_configs/dc_multi install'
-                        }
-                        //===================== CLIENT ========================
-                        dir("clients/functional_tests/${build_dir}/DC_MULTI") {
-                            sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install'
-                        }
-                    }
-                }
-            }
-        }
-        stage('Run Tests') {
-            parallel {
-                stage('RC_SINGLE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI_WF_COAL') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI_WF_COAL'
-                        }
-                    }
-                }
-
-                stage('RC_MULTI') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI'
-                        }
-                    }
-                }
-
-                stage('DC_SINGLE') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/DC_SINGLE'
-                        }
-                    }
-                }
-
-                stage('DC_MULTI') {
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh './driver.sh ${build_dir}/DC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI'
-                        }
-                    }
-                }
-
-                stage('RO_NET_BASIC') {
-                    // RO_NET controlled at runtime, no need for a new build. Use RC_MULTI
-                    steps {
-                        dir("clients/functional_tests") {
-                            sh 'mkdir -p ${build_dir}/RO_NET_BASIC'
-                            sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC'
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
diff --git a/projects/rocshmem/internal/continuous_integration/smoke/smoke_config.xml b/projects/rocshmem/internal/continuous_integration/smoke/smoke_config.xml
deleted file mode 100644
index cdf8981d88..0000000000
--- a/projects/rocshmem/internal/continuous_integration/smoke/smoke_config.xml
+++ /dev/null
@@ -1,85 +0,0 @@
-<?xml version='1.1' encoding='UTF-8'?>
-<flow-definition plugin="workflow-job@2.40">
-  <actions/>
-  <description></description>
-  <keepDependencies>false</keepDependencies>
-  <properties>
-    <org.jenkinsci.plugins.workflow.job.properties.PipelineTriggersJobProperty>
-      <triggers>
-        <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.GerritTrigger plugin="gerrit-trigger@2.33.0">
-          <spec></spec>
-          <gerritProjects>
-            <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.GerritProject>
-              <compareType>PLAIN</compareType>
-              <pattern>rsch/ec/shmem</pattern>
-              <branches>
-                <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.Branch>
-                  <compareType>PLAIN</compareType>
-                  <pattern>amd-master</pattern>
-                </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.Branch>
-              </branches>
-              <disableStrictForbiddenFileVerification>false</disableStrictForbiddenFileVerification>
-            </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.data.GerritProject>
-          </gerritProjects>
-          <dynamicGerritProjects class="empty-list"/>
-          <skipVote>
-            <onSuccessful>true</onSuccessful>
-            <onFailed>true</onFailed>
-            <onUnstable>true</onUnstable>
-            <onNotBuilt>true</onNotBuilt>
-            <onAborted>true</onAborted>
-          </skipVote>
-          <silentMode>false</silentMode>
-          <enableTopicAssociation>false</enableTopicAssociation>
-          <notificationLevel></notificationLevel>
-          <silentStartMode>false</silentStartMode>
-          <escapeQuotes>true</escapeQuotes>
-          <nameAndEmailParameterMode>PLAIN</nameAndEmailParameterMode>
-          <dependencyJobsNames></dependencyJobsNames>
-          <commitMessageParameterMode>BASE64</commitMessageParameterMode>
-          <changeSubjectParameterMode>PLAIN</changeSubjectParameterMode>
-          <commentTextParameterMode>BASE64</commentTextParameterMode>
-          <buildStartMessage></buildStartMessage>
-          <buildFailureMessage></buildFailureMessage>
-          <buildSuccessfulMessage></buildSuccessfulMessage>
-          <buildUnstableMessage></buildUnstableMessage>
-          <buildNotBuiltMessage></buildNotBuiltMessage>
-          <buildAbortedMessage></buildAbortedMessage>
-          <buildUnsuccessfulFilepath></buildUnsuccessfulFilepath>
-          <customUrl></customUrl>
-          <serverName>amd-gerrit</serverName>
-          <triggerOnEvents>
-            <com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.events.PluginCommentAddedContainsEvent>
-              <commentAddedCommentContains>!SMOKE</commentAddedCommentContains>
-            </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.events.PluginCommentAddedContainsEvent>
-          </triggerOnEvents>
-          <dynamicTriggerConfiguration>false</dynamicTriggerConfiguration>
-          <triggerConfigURL></triggerConfigURL>
-          <triggerInformationAction/>
-        </com.sonyericsson.hudson.plugins.gerrit.trigger.hudsontrigger.GerritTrigger>
-      </triggers>
-    </org.jenkinsci.plugins.workflow.job.properties.PipelineTriggersJobProperty>
-  </properties>
-  <definition class="org.jenkinsci.plugins.workflow.cps.CpsScmFlowDefinition" plugin="workflow-cps@2.90">
-    <scm class="hudson.plugins.git.GitSCM" plugin="git@4.7.1">
-      <configVersion>2</configVersion>
-      <userRemoteConfigs>
-        <hudson.plugins.git.UserRemoteConfig>
-          <url>ssh://gerritgit/rsch/ec/shmem</url>
-        </hudson.plugins.git.UserRemoteConfig>
-      </userRemoteConfigs>
-      <branches>
-        <hudson.plugins.git.BranchSpec>
-          <name>FETCH_HEAD</name>
-        </hudson.plugins.git.BranchSpec>
-      </branches>
-      <doGenerateSubmoduleConfigurations>false</doGenerateSubmoduleConfigurations>
-      <submoduleCfg class="empty-list"/>
-      <extensions/>
-    </scm>
-    <scriptPath>internal/continuous_integration/smoke/Jenkinsfile</scriptPath>
-    <lightweight>false</lightweight>
-  </definition>
-  <triggers/>
-  <disabled>false</disabled>
-</flow-definition>
\ No newline at end of file
diff --git a/projects/rocshmem/internal/scripts/cscope-index.py b/projects/rocshmem/internal/scripts/cscope-index.py
deleted file mode 100755
index 04e8698035..0000000000
--- a/projects/rocshmem/internal/scripts/cscope-index.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#! /usr/bin/python
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-import os
-
-suffixes = [ '.cpp', '.hpp', '.c', '.h' ]
-directories = [ 'src', 'include' ]
-
-def oksuffix(f):
-    for s in suffixes:
-        if f.endswith(s):
-            return True
-    return False
-
-def try_index_dir(directory):
-    for dirpath,subdirs,files in os.walk(os.path.join(cwd, directory)):
-        okfiles = [f for f in files if oksuffix(f)]
-        if okfiles:
-            print >> file_list, \
-                  '\n'.join([os.path.join(dirpath, f) for f in okfiles])
-
-
-file_list = file('cscope.files', 'w')
-cwd = os.getcwd()
-for d in directories:
-    try_index_dir(d)
-file_list.close()
-
-os.system("cscope -b")
diff --git a/projects/rocshmem/internal/workloads/Makefile b/projects/rocshmem/internal/workloads/Makefile
deleted file mode 100644
index 70dd89b582..0000000000
--- a/projects/rocshmem/internal/workloads/Makefile
+++ /dev/null
@@ -1,105 +0,0 @@
-HIPCC=hipcc
-BUILD=./build
-SRC=./src
-RESULTS=./results
-
-#rocshmem_DIR=${HOME}/rocshmem
-#MPI_HOME=${HOME}/mpich/install
-NCCL_HOME=${HOME}/rccl/build
-
-MPI_FLAGS=-lmpi -lhsa-runtime64 -lrt -L${MPI_HOME}/lib -fgpu-rdc 
-SHMEM_FLAGS=${MPI_FLAGS} -lmlx5 -libverbs 
-RCCL_FLAGS=${MPI_FLAGS} -Wl,-rpath,$(NCCL_HOME) -L${NCCL_HOME} -lrccl
-
-.SILENT: run_scan extract_scan run_sort run_sort_shmem run_sort_rccl extract_sort
-
-all: ${BUILD}/sort_shmem ${BUILD}/sort_rccl ${BUILD}/sort_mpi
-
-${BUILD}/sort_shmem: ${BUILD}/sort_shmem.o ${rocshmem_DIR}/lib/librocshmem.a
-	${HIPCC} $^ ${SHMEM_FLAGS} -o $@
-
-${BUILD}/sort_shmem.o: ${SRC}/sort_shmem.cu
-	${HIPCC} $^ -I${rocshmem_DIR}/include -I${MPI_HOME}/include -fgpu-rdc -o $@ -c
-
-${BUILD}/sort_rccl: ${BUILD}/sort_rccl.o
-	${HIPCC} $^ ${RCCL_FLAGS} -o $@
-
-${BUILD}/sort_rccl.o: ${SRC}/sort_rccl.cu
-	${HIPCC} $^ -I$(NCCL_HOME)/include/rccl -I${MPI_HOME}/include -fgpu-rdc -o $@ -c
-
-${BUILD}/sort_mpi: ${BUILD}/sort_mpi.o
-	${HIPCC} $^ ${MPI_FLAGS} -o $@
-
-${BUILD}/sort_mpi.o: ${SRC}/sort_mpi.cu
-	${HIPCC} $^ -I${MPI_HOME}/include -fgpu-rdc -o $@ -c
-
-RO_FLAGS=ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1
-ITERS?=0 1 2 3 4 5 6 7 8 9
-TIMEOUT=1m
-HOSTS=sv-pdp-0,sv-pdp-1,sv-pdp-2,sv-pdp-3
-SCAN_SIZE=1024
-PES=2 4 8 12 16
-PES_RCCL=2 4 8
-
-TYPE ?= Naive
-LABEL ?= naive
-PARAM ?= 0
-NUM_PES ?= 2
-
-run_sort_shmem: ${BUILD}/sort_shmem
-	printf "${TYPE} ";\
-	echo "" > ${RESULTS}/sort_${LABEL}_${NUM_PES}.out; \
-	for j in ${ITERS}; do \
-		${RO_FLAGS} timeout ${TIMEOUT} mpirun -np ${NUM_PES} -hosts ${HOSTS} ${BUILD}/sort_shmem ${PARAM} >> ${RESULTS}/sort_${LABEL}_${NUM_PES}.out;\
-	done;
-
-run_sort_rccl: ${BUILD}/sort_rccl
-	printf "RCCL "; \
-	echo "" > ${RESULTS}/sort_rccl_${NUM_PES}.out; \
-	for j in ${ITERS}; do \
-		timeout ${TIMEOUT} mpirun -np ${NUM_PES} -hosts ${HOSTS} ${BUILD}/sort_rccl >> ${RESULTS}/sort_rccl_${NUM_PES}.out;\
-	done;
-
-run_sort_mpi: ${BUILD}/sort_rccl
-	printf "MPI2 "; \
-	echo "" > ${RESULTS}/sort_mpi2_${NUM_PES}.out; \
-	for j in ${ITERS}; do \
-		timeout ${TIMEOUT} mpirun -np ${NUM_PES} -hosts ${HOSTS} ${BUILD}/sort_mpi >> ${RESULTS}/sort_mpi2_${NUM_PES}.out;\
-	done;
-
-run_sort: ${BUILD}/sort_shmem ${BUILD}/sort_rccl
-	for i in ${PES}; do \
-		printf "%d " $$i; \
-		$(MAKE) --no-print-directory run_sort_shmem TYPE=NAIVE LABEL=naive PARAM=0 NUM_PES=$${i}; \
-		$(MAKE) --no-print-directory run_sort_shmem TYPE=MPI LABEL=mpi PARAM=1 NUM_PES=$${i}; \
-		$(MAKE) --no-print-directory run_sort_shmem TYPE=GCEN LABEL=gcen PARAM=2 NUM_PES=$${i}; \
-		$(MAKE) --no-print-directory run_sort_shmem TYPE=GCEN2 LABEL=gcen2 PARAM=3 NUM_PES=$${i}; \
-		$(MAKE) --no-print-directory run_sort_mpi NUM_PES=$${i}; \
-		printf "\n";\
-	done
-	for i in ${PES_RCCL}; do \
-		$(MAKE) --no-print-directory run_sort_rccl NUM_PES=$${i}; \
-		printf "%d " $$i; \
-	done
-
-	$(MAKE) extract_sort
-
-
-extract_sort:
-	printf "Sort latency\n"
-	printf "PROCS\tType\tRuns"
-	for i in ${PES}; do \
-		for type in mpi mpi2 rccl naive gcen gcen2; do\
-			printf "\n%d\t$${type}\t" $$i; \
-			file=${RESULTS}/sort_$${type}_$${i}.out;\
-			latency=$$(grep -E "Avg time" $${file}); \
-			grep -E "Avg time" $${file} | while read -r j; do\
-				val=$$(echo $$j | grep -oE -m1 "[0-9]+\.[0-9]+");\
-				printf "%s\t" $${val};\
-			done; \
-		done;\
-	done
-	printf "\n"
-
-clean: 
-	rm build/*;
diff --git a/projects/rocshmem/internal/workloads/src/common.h b/projects/rocshmem/internal/workloads/src/common.h
deleted file mode 100644
index a3f109103d..0000000000
--- a/projects/rocshmem/internal/workloads/src/common.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <chrono>
-#include <iostream>
-#include <stdio.h>
-#include <mpi.h>
-#include <unistd.h>
-#include <hip/hip_runtime.h>
-using namespace std;
-
-#define TIME_NOW std::chrono::steady_clock::now()
-#define TIME_DIFF(a, b) std::chrono::duration_cast<std::chrono::nanoseconds>(a - b).count()
-
-#define HIPCHECK(cmd) do {                         \
-  hipError_t e = cmd;                              \
-  if( e != hipSuccess ) {                          \
-    printf("Failed: Hip error %s:%d '%s'\n",             \
-        __FILE__,__LINE__,hipGetErrorString(e));   \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-
-#define NCCLCHECK(cmd) do {                         \
-  ncclResult_t r = cmd;                             \
-  if (r!= ncclSuccess) {                            \
-    printf("Failed, NCCL error %s:%d '%s'\n",             \
-        __FILE__,__LINE__,ncclGetErrorString(r));   \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-// Copied from rccl-tests, used to hash hostname
-static uint64_t getHash(const char* string, size_t n) {
-  // Based on DJB2a, result = result * 33 ^ char
-  uint64_t result = 5381;
-  for (size_t c = 0; c < n; c++) {
-    result = ((result << 5) + result) ^ string[c];
-  }
-  return result;
-}
-
-/* Generate a hash of the unique identifying string for this host
- * that will be unique for both bare-metal and container instances
- * Equivalent of a hash of;
- *
- * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
- *
- */
-#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
-static uint64_t getHostHash(const char* hostname) {
-  char hostHash[1024];
-
-  // Fall back is the hostname if something fails
-  (void) strncpy(hostHash, hostname, sizeof(hostHash));
-  int offset = strlen(hostHash);
-
-  FILE *file = fopen(HOSTID_FILE, "r");
-  if (file != NULL) {
-    char *p;
-    if (fscanf(file, "%ms", &p) == 1) {
-        strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
-        free(p);
-    }
-  }
-  fclose(file);
-
-  // Make sure the string is terminated
-  hostHash[sizeof(hostHash)-1]='\0';
-
-  return getHash(hostHash, strlen(hostHash));
-}
diff --git a/projects/rocshmem/internal/workloads/src/sort.h b/projects/rocshmem/internal/workloads/src/sort.h
deleted file mode 100644
index 482ba7ee2d..0000000000
--- a/projects/rocshmem/internal/workloads/src/sort.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*************************************************************************
- *                                                                       * 
- *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
- *                                                                       * 
- *                                  I S                                  * 
- *                                                                       * 
- ************************************************************************* 
- *                                                                       * 
- *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
- *   It is described in NAS Technical Report 95-020.                     * 
- *                                                                       * 
- *   Permission to use, copy, distribute and modify this software        * 
- *   for any purpose with or without fee is hereby granted.  We          * 
- *   request, however, that all derived work reference the NAS           * 
- *   Parallel Benchmarks 3.3. This software is provided "as is"          *
- *   without express or implied warranty.                                * 
- *                                                                       * 
- *   Information on NPB 3.3, including the technical report, the         *
- *   original specifications, source code, results and information       * 
- *   on how to submit new results, is available at:                      * 
- *                                                                       * 
- *          http://www.nas.nasa.gov/Software/NPB                         * 
- *                                                                       * 
- *   Send comments or suggestions to  npb@nas.nasa.gov                   * 
- *   Send bug reports to              npb-bugs@nas.nasa.gov              * 
- *                                                                       * 
- *         NAS Parallel Benchmarks Group                                 * 
- *         NASA Ames Research Center                                     * 
- *         Mail Stop: T27A-1                                             * 
- *         Moffett Field, CA   94035-1000                                * 
- *                                                                       * 
- *         E-mail:  npb@nas.nasa.gov                                     * 
- *         Fax:     (650) 604-3957                                       * 
- *                                                                       * 
- ************************************************************************* 
- *                                                                       * 
- *   Author: M. Yarrow                                                   * 
- *           H. Jin                                                      * 
- *                                                                       * 
- *************************************************************************/
-
-#define NUM_WGS 1
-#define WG_SIZE 1024
-#define MAX_PES 128
-
-#define MAX_KEY (1 << 11)
-
-/*
- *    FUNCTION RANDLC (X, A)
- *
- *  This routine returns a uniform pseudorandom double precision number in the
- *  range (0, 1) by using the linear congruential generator
- *
- *  x_{k+1} = a x_k  (mod 2^46)
- *
- *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
- *  before repeating.  The argument A is the same as 'a' in the above formula,
- *  and X is the same as x_0.  A and X must be odd double precision integers
- *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
- *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
- *  the new seed x_1, so that subsequent calls to RANDLC using the same
- *  arguments will generate a continuous sequence.
- *
- *  This routine should produce the same results on any computer with at least
- *  48 mantissa bits in double precision floating point data.  On Cray systems,
- *  double precision should be disabled.
- *
- *  David H. Bailey     October 26, 1990
- *
- *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
- *     SAVE KS, R23, R46, T23, T46
- *     DATA KS/0/
- *
- *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
- *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
- *  by merely using the ** operator, in order to insure that the results are
- *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
- */
-
-
-
-/*****************************************************************/
-/*************           R  A  N  D  L  C             ************/
-/*************                                        ************/
-/*************    portable random number generator    ************/
-/*****************************************************************/
-
-double	randlc( double *X, double *A )
-{
-      static int        KS=0;
-      static double	R23, R46, T23, T46;
-      double		T1, T2, T3, T4;
-      double		A1;
-      double		A2;
-      double		X1;
-      double		X2;
-      double		Z;
-      int     		i, j;
-
-      if (KS == 0) 
-      {
-        R23 = 1.0;
-        R46 = 1.0;
-        T23 = 1.0;
-        T46 = 1.0;
-    
-        for (i=1; i<=23; i++)
-        {
-          R23 = 0.50 * R23;
-          T23 = 2.0 * T23;
-        }
-        for (i=1; i<=46; i++)
-        {
-          R46 = 0.50 * R46;
-          T46 = 2.0 * T46;
-        }
-        KS = 1;
-      }
-
-/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */
-
-      T1 = R23 * *A;
-      j  = T1;
-      A1 = j;
-      A2 = *A - T23 * A1;
-
-/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
-    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
-    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */
-
-      T1 = R23 * *X;
-      j  = T1;
-      X1 = j;
-      X2 = *X - T23 * X1;
-      T1 = A1 * X2 + A2 * X1;
-      
-      j  = R23 * T1;
-      T2 = j;
-      Z = T1 - T23 * T2;
-      T3 = T23 * Z + A2 * X2;
-      j  = R46 * T3;
-      T4 = j;
-      *X = T3 - T46 * T4;
-      return(R46 * *X);
-} 
-
-
-
-/*****************************************************************/
-/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
-/************                                         ************/
-/************ returns parallel random number seq seed ************/
-/*****************************************************************/
-
-/*
- * Create a random number sequence of total length nn residing
- * on np number of processors.  Each processor will therefore have a 
- * subsequence of length nn/np.  This routine returns that random 
- * number which is the first random number for the subsequence belonging
- * to processor rank kn, and which is used as seed for proc kn ran # gen.
- */
-
-double   find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
-                       int  np,       /* np = num procs                      */
-                       long nn,       /* total num of ran numbers, all procs */
-                       double s,      /* Ran num seed, for ex.: 314159265.00 */
-                       double a )     /* Ran num gen mult, try 1220703125.00 */
-{
-
-  long   i;
-
-  double t1,t2,t3,an;
-  long   mq,nq,kk,ik;
-
-
-
-      nq = nn / np;
-
-      for( mq=0; nq>1; mq++,nq/=2 )
-          ;
-
-      t1 = a;
-
-      for( i=1; i<=mq; i++ )
-        t2 = randlc( &t1, &t1 );
-
-      an = t1;
-
-      kk = kn;
-      t1 = s;
-      t2 = an;
-
-      for( i=1; i<=100; i++ )
-      {
-        ik = kk / 2;
-        if( 2 * ik !=  kk ) 
-            t3 = randlc( &t1, &t2 );
-        if( ik == 0 ) 
-            break;
-        t3 = randlc( &t2, &t2 );
-        kk = ik;
-      }
-
-      return( t1 );
-
-}
-
-
-
-
-/*****************************************************************/
-/*************      C  R  E  A  T  E  _  S  E  Q      ************/
-/*****************************************************************/
-
-void	create_seq( double seed, double a, int *key_array, int size )
-{
-	double x;
-	int    i, k;
-
-        k = MAX_KEY/4;
-
-	for (i=0; i < size; i++)
-	{
-	    x = randlc(&seed, &a);
-	    x += randlc(&seed, &a);
-    	    x += randlc(&seed, &a);
-	    x += randlc(&seed, &a);  
-
-            key_array[i] = k*x;
-	}
-}
\ No newline at end of file
diff --git a/projects/rocshmem/internal/workloads/src/sort_mpi.cu b/projects/rocshmem/internal/workloads/src/sort_mpi.cu
deleted file mode 100644
index 3907c6aefd..0000000000
--- a/projects/rocshmem/internal/workloads/src/sort_mpi.cu
+++ /dev/null
@@ -1,380 +0,0 @@
-#include "mpi.h"
-#include "common.h"
-#include "sort.h"
-
-//#define TIME_PERF
-#ifdef TIME_PERF
-#define TIMERS 10
-__device__ uint64_t timers[TIMERS] = {0};
-__device__ uint64_t time_start;
-#define TIMERS_START() \
-    if(threadIdx.x == 0) {\
-        time_start = rocshmem_timer();\
-    }
-
-#define TIME(TIMER_NUM) \
-    if(threadIdx.x == 0) {\
-        timers[TIMER_NUM] = rocshmem_timer() - time_start;\
-        time_start = rocshmem_timer();\
-    }
-
-#define OUTPUT_TIME() \
-    if(threadIdx.x == 0 && my_pe == 0) { \
-        uint64_t sum = 0; \
-        for(int i = 0; i < TIMERS; ++i) { \
-            sum += timers[i]; \
-        } \
-        for(int i = 0; i < TIMERS; ++i) { \
-            printf("%d: %f\n", i, (double)timers[i] / (double)sum); \
-        } \
-    }
-#else
-#define TIMERS_START()
-#define TIME(x)
-#define OUTPUT_TIME() 
-#endif
-
-__global__ void sort1(volatile int *keys, int *keyBuffer1,
-                     int *keyBuffer2, int *sendCount, 
-                     int *recvCount, int *sendOffset,
-                     int *recvOffset, int *outputKeys, 
-                     size_t size, int n_pes, int my_pe) {
-    __shared__ int bucketCounter[MAX_PES];
-    __shared__ int bucketPtr[MAX_PES];
-    __shared__ int total_size;
-
-    int buckets = n_pes;
-
-    int tid = threadIdx.x; // + blockDim.x * blockIdx.x;
-    const int K_PER_BUCK = (MAX_KEY / buckets);
-
-    // Reset
-    for(int i = threadIdx.x; i < buckets; i += blockDim.x) {
-        bucketCounter[i] = 0;
-        bucketPtr[i] = 0;
-    }
-    __syncthreads();
-    TIMERS_START()
-    // Count size of each bucket
-    for(int i = tid; i < size; i += blockDim.x) {
-        atomicAdd(&bucketCounter[keys[i] / K_PER_BUCK], 1);
-    }
-    __syncthreads();
-    TIME(0)
-    // Update in global memory
-    for(int i = tid; i < buckets; i += blockDim.x) {
-        sendCount[i] = bucketPtr[i] = bucketCounter[i];
-    }
-    __syncthreads();
-    TIME(1)
-    // Perform local scan to get ptrs set
-    for(int shift = 1; shift < buckets; shift *= 2) {
-        int temp = 0;
-        if(threadIdx.x >= shift && threadIdx.x < buckets) {
-            temp = bucketPtr[threadIdx.x - shift];
-        }
-        __syncthreads();
-        if(threadIdx.x < buckets) {
-            bucketPtr[threadIdx.x] += temp;
-        }
-        __syncthreads();
-    }
-    __syncthreads();
-    TIME(2)
-    // Find offsets of where we're sending
-    for(int i = threadIdx.x; i < buckets; i += blockDim.x) {
-        sendOffset[i] = bucketPtr[i] - sendCount[i];
-    }
-    // Sort keys into buckets
-    for(int i = threadIdx.x; i < size; i += blockDim.x) {
-        int loc = atomicAdd(&bucketPtr[keys[i] / K_PER_BUCK], -1) - 1;
-        keyBuffer1[loc] = keys[i];
-    }
-    TIME(3)
-    OUTPUT_TIME()
-}
-
-__global__ void sort2(volatile int *keys, int *keyBuffer1,
-                     int *keyBuffer2, int *sendCount, 
-                     int *recvCount, int *sendOffset,
-                     int *recvOffset, int *outputKeys, 
-                     size_t size, int n_pes, int my_pe) {
-    __shared__ int total_size;
-
-    int buckets = n_pes;
-
-    int tid = threadIdx.x; // + blockDim.x * blockIdx.x;
-    const int K_PER_BUCK = (MAX_KEY / buckets);
-
-    for(int i = threadIdx.x; i < K_PER_BUCK; i += blockDim.x)
-        outputKeys[i] = 0;
-    __syncthreads();
-    TIME(5)
-    int min_key_val = my_pe * K_PER_BUCK;
-    int max_key_val = (my_pe + 1) * K_PER_BUCK - 1;
-
-    int *key_buff_ptr = outputKeys - min_key_val;
-    for(int i = threadIdx.x; i < total_size; i += blockDim.x) {
-        atomicAdd(&key_buff_ptr[keyBuffer2[i]], 1);
-    }
-    __syncthreads();
-    TIME(6)
-    // Perform local scan on keys
-    for(int shift = 1; shift < K_PER_BUCK; shift *= 2) {
-        int temp = 0;
-        if(threadIdx.x >= shift && threadIdx.x < K_PER_BUCK) {
-            temp = outputKeys[threadIdx.x - shift];
-        }
-        __syncthreads();
-        if(threadIdx.x < K_PER_BUCK) {
-            outputKeys[threadIdx.x] += temp;
-        }
-        __syncthreads();
-    }
-    TIME(7)
-    OUTPUT_TIME()
-}
-
-void sort(volatile int *keys, int *keyBuffer1,
-            int *keyBuffer2, int *sendCount, 
-            int *recvCount, int *sendOffset,
-            int *recvOffset, int *outputKeys, 
-            size_t size, int max_iters) {
-    int nProcs, my_pe;
-    MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
-    
-    hipStream_t stream;
-    HIPCHECK(hipStreamCreate(&stream));
-
-    for(int iter = 0; iter < max_iters; ++iter) {
-        //fprintf(stderr, "%d: %d %d %p %p\n", my_pe, iter, max_iters, sendCount, recvCount);
-        sort1<<<1, WG_SIZE, 0, stream>>>(keys, keyBuffer1,
-                     keyBuffer2, sendCount, recvCount, sendOffset,
-                     recvOffset, outputKeys, size, nProcs, my_pe);
-        HIPCHECK(hipStreamSynchronize(stream));
-        MPI_Alltoall(sendCount, 1, MPI_INT, recvCount, 1, 
-                               MPI_INT, MPI_COMM_WORLD);
-        MPI_Alltoall(sendOffset, 1, MPI_INT, recvOffset, 1, 
-                               MPI_INT, MPI_COMM_WORLD);
-        int total_size = 0;
-        MPI_Request *req = new MPI_Request[2 * nProcs];
-        const int TAG = 10000;
-        for(int i = 0; i < nProcs; ++i) {
-            MPI_Isend(&keyBuffer1[sendOffset[i]], sendCount[i], 
-                     MPI_INT, i, TAG, MPI_COMM_WORLD, &req[2 * i]);
-            MPI_Irecv(&keyBuffer2[total_size], recvCount[i], 
-                     MPI_INT, i, TAG, MPI_COMM_WORLD, &req[2 * i + 1]);
-            total_size += recvCount[i];
-        }
-        MPI_Waitall(2 * nProcs, req, MPI_STATUS_IGNORE);
-        sort2<<<1, WG_SIZE, 0, stream>>>(keys, keyBuffer1,
-                     keyBuffer2, sendCount, recvCount, sendOffset,
-                     recvOffset, outputKeys, size, nProcs, my_pe);
-    }
-}
-
-bool verify(int *outputKeys, int *keyBuffer2, size_t size)
-{   
-    int num_pes, my_pe;
-    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
-
-    MPI_Status  status;
-    MPI_Request request;
-
-    int min_key_val = my_pe * (MAX_KEY / num_pes);
-    int max_key_val = (my_pe + 1) * (MAX_KEY / num_pes) - 1;
-
-    int *key_array = new int[size];
-    // Perform final untimed sort on keys
-    for(int i = 0; i < size; ++i)
-        if(outputKeys[keyBuffer2[i] - min_key_val] > 0)
-            key_array[--outputKeys[keyBuffer2[i] - min_key_val]] = keyBuffer2[i];
-        else {
-            fprintf(stderr, "%d: Found wrong key %d at %d with %d\n", my_pe, keyBuffer2[i], i, outputKeys[keyBuffer2[i]]);
-            return false;
-        }
-
-    if(size < 1)
-        size = 1;
-
-    int k;
-    const int MPI_TAG = 1000;
-    // Check if largest key is smaller than next processor's
-    if(my_pe > 0)
-        MPI_Irecv(&k, 1, MPI_INT, my_pe - 1, MPI_TAG, MPI_COMM_WORLD,
-                  &request);                   
-    if(my_pe < num_pes - 1)
-        MPI_Send(&key_array[size - 1], 1, MPI_INT, my_pe + 1, MPI_TAG,
-                 MPI_COMM_WORLD );
-    if(my_pe > 0)
-        MPI_Wait(&request, &status);
-
-    // Check if it is smaller
-    int j = 0;
-    if( my_pe > 0 && size > 1 )
-        if( k > key_array[0] )
-            j++;
-
-    // Check if keys correctly sorted
-    for(int i = 1; i < size; i++)
-        if(key_array[i - 1] > key_array[i])
-            j++;
-
-    delete[] key_array;
-
-    if(j != 0) {
-        fprintf(stderr, "Processor %d:  Full_verify: number of keys out of sort: %d\n",
-                my_pe, j );
-        return false;
-    }
-    return true;
-}
-
-void initGPU() 
-{
-    // Calculation for local rank, taken from rccl-tests
-    int localRank = 0;
-    int nProcs, proc;
-    MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &proc);
-    char hostname[1024];
-    gethostname(hostname, 1024);
-    for (int i=0; i< 1024; i++) {
-        if (hostname[i] == '.') {
-            hostname[i] = '\0';
-            break;
-        }
-    }
-    uint64_t hostHashs[nProcs];
-    hostHashs[proc] = getHostHash(hostname);
-    MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
-    for (int p=0; p<nProcs; p++) {
-      if (p == proc) break;
-      if (hostHashs[p] == hostHashs[proc]) localRank++;
-    }
-    
-    /***
-     * Select a GPU
-     */    
-    int ndevices, my_device=0;
-    hipGetDeviceCount (&ndevices);
-    my_device = localRank % ndevices;
-    hipSetDevice(my_device);
-
-    printf("Rank %d: Device %d, Host %s\n", proc, my_device, hostname);
-    fflush(stdout);
-    MPI_Barrier(MPI_COMM_WORLD);
-}
-
-void *rocshmem_malloc(size_t size)
-{
-    void *v;
-    hipMalloc((void **)&v, size);
-    return v;
-}
-
-int rocshmem_free(void *v)
-{
-    return hipFree(v);
-}
-
-int main(int argc, char *argv[])
-{
-    if(argc < 1) {
-        printf("Format: %s [iterations]\n", argv[0]);
-        return -1;
-    }
-
-    // Init stuff
-    MPI_Init(&argc, &argv);
-    initGPU();
-
-    int iterations = 1000;
-    if(argc > 1)
-        iterations = atoi(argv[1]);
-    
-    int num_pes, my_pe;
-    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
-
-    // Configure input and outputs
-    size_t size = 1024; //atoi(argv[1]);
-    int *keys, *outputKeys;
-    hipMalloc((void**)&keys, sizeof(int) * size);
-    hipMalloc((void**)&outputKeys, sizeof(int) * WG_SIZE);
-
-/*  Generate random number sequence and subsequent keys on all procs */
-    create_seq( find_my_seed( my_pe, 
-                              num_pes, 
-                              4*(long)size*num_pes,
-                              314159265.00,      /* Random number gen seed */
-                              1220703125.00 ),   /* Random number gen mult */
-                1220703125.00, keys, size );     /* Random number gen mult */
-
-
-    // Init buffers
-    int *keyBuffer1, *keyBuffer2;
-    keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size);
-    keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4);
-    
-    int *sendCount = 0, *recvCount = 0, *sendOffset = 0, *recvOffset = 0;
-    sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-    recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-    sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-    recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-
-    printf("Begin untimed run\n");
-    // Untimed run
-    MPI_Barrier(MPI_COMM_WORLD);
-    sort((int*)keys, keyBuffer1, keyBuffer2, 
-        sendCount, recvCount, sendOffset, recvOffset, 
-        outputKeys, size, 1);
-    hipDeviceSynchronize();
-
-    printf("Verify untimed run\n");
-    // Verify correctness
-    if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) {
-        fprintf(stderr, "Wrong output\n");
-        return -1;
-    }
-
-    printf("Begin timed run\n");
-    // Timed run
-    MPI_Barrier(MPI_COMM_WORLD);
-    auto time_start = TIME_NOW;
-    sort((int*)keys, keyBuffer1, keyBuffer2, 
-        sendCount, recvCount, sendOffset, recvOffset, 
-        outputKeys, size, iterations);
-    hipDeviceSynchronize();
-    double tot_time = (double)TIME_DIFF(TIME_NOW, time_start);
-    
-    double all_time = 0;
-    MPI_Allreduce(&tot_time, &all_time, 1,
-        MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-    if(my_pe == 0) {
-        printf("Avg time:\t%.3f\tus\n", all_time / (double)(1000.0 * iterations * num_pes));
-    }
-
-    // Verify correctness
-    if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) {
-        fprintf(stderr, "Wrong output\n");
-        return -1;
-    }
-
-    fprintf(stderr, "Done verify for %d\n", my_pe);
-
-    // Clean up
-    hipFree(keys);
-    hipFree(outputKeys);
-    rocshmem_free(keyBuffer1);
-    rocshmem_free(keyBuffer2);
-    rocshmem_free(sendCount);
-    rocshmem_free(recvCount);
-    rocshmem_free(sendOffset);
-    rocshmem_free(recvOffset);
-    MPI_Finalize();
-    return 0;
-}
diff --git a/projects/rocshmem/internal/workloads/src/sort_rccl.cu b/projects/rocshmem/internal/workloads/src/sort_rccl.cu
deleted file mode 100644
index b9e0f12536..0000000000
--- a/projects/rocshmem/internal/workloads/src/sort_rccl.cu
+++ /dev/null
@@ -1,394 +0,0 @@
-#include "rccl.h"
-#include "common.h"
-#include "sort.h"
-
-//#define TIME_PERF
-#ifdef TIME_PERF
-#define TIMERS 10
-__device__ uint64_t timers[TIMERS] = {0};
-__device__ uint64_t time_start;
-#define TIMERS_START() \
-    if(threadIdx.x == 0) {\
-        time_start = rocshmem_timer();\
-    }
-
-#define TIME(TIMER_NUM) \
-    if(threadIdx.x == 0) {\
-        timers[TIMER_NUM] = rocshmem_timer() - time_start;\
-        time_start = rocshmem_timer();\
-    }
-
-#define OUTPUT_TIME() \
-    if(threadIdx.x == 0 && my_pe == 0) { \
-        uint64_t sum = 0; \
-        for(int i = 0; i < TIMERS; ++i) { \
-            sum += timers[i]; \
-        } \
-        for(int i = 0; i < TIMERS; ++i) { \
-            printf("%d: %f\n", i, (double)timers[i] / (double)sum); \
-        } \
-    }
-#else
-#define TIMERS_START()
-#define TIME(x)
-#define OUTPUT_TIME() 
-#endif
-
-__global__ void sort1(volatile int *keys, int *keyBuffer1,
-                     int *keyBuffer2, int *sendCount, 
-                     int *recvCount, int *sendOffset,
-                     int *recvOffset, int *outputKeys, 
-                     size_t size, int n_pes, int my_pe) {
-    __shared__ int bucketCounter[MAX_PES];
-    __shared__ int bucketPtr[MAX_PES];
-    __shared__ int total_size;
-
-    int buckets = n_pes;
-
-    int tid = threadIdx.x; // + blockDim.x * blockIdx.x;
-    const int K_PER_BUCK = (MAX_KEY / buckets);
-
-    // Reset
-    for(int i = threadIdx.x; i < buckets; i += blockDim.x) {
-        bucketCounter[i] = 0;
-        bucketPtr[i] = 0;
-    }
-    __syncthreads();
-    TIMERS_START()
-    // Count size of each bucket
-    for(int i = tid; i < size; i += blockDim.x) {
-        atomicAdd(&bucketCounter[keys[i] / K_PER_BUCK], 1);
-    }
-    __syncthreads();
-    TIME(0)
-    // Update in global memory
-    for(int i = tid; i < buckets; i += blockDim.x) {
-        sendCount[i] = bucketPtr[i] = bucketCounter[i];
-    }
-    __syncthreads();
-    TIME(1)
-    // Perform local scan to get ptrs set
-    for(int shift = 1; shift < buckets; shift *= 2) {
-        int temp = 0;
-        if(threadIdx.x >= shift && threadIdx.x < buckets) {
-            temp = bucketPtr[threadIdx.x - shift];
-        }
-        __syncthreads();
-        if(threadIdx.x < buckets) {
-            bucketPtr[threadIdx.x] += temp;
-        }
-        __syncthreads();
-    }
-    __syncthreads();
-    TIME(2)
-    // Find offsets of where we're sending
-    for(int i = threadIdx.x; i < buckets; i += blockDim.x) {
-        sendOffset[i] = bucketPtr[i] - sendCount[i];
-    }
-    // Sort keys into buckets
-    for(int i = threadIdx.x; i < size; i += blockDim.x) {
-        int loc = atomicAdd(&bucketPtr[keys[i] / K_PER_BUCK], -1) - 1;
-        keyBuffer1[loc] = keys[i];
-    }
-    TIME(3)
-    OUTPUT_TIME()
-}
-
-__global__ void sort2(volatile int *keys, int *keyBuffer1,
-                     int *keyBuffer2, int *sendCount, 
-                     int *recvCount, int *sendOffset,
-                     int *recvOffset, int *outputKeys, 
-                     size_t size, int n_pes, int my_pe) {
-    __shared__ int total_size;
-
-    int buckets = n_pes;
-
-    int tid = threadIdx.x; // + blockDim.x * blockIdx.x;
-    const int K_PER_BUCK = (MAX_KEY / buckets);
-
-    for(int i = threadIdx.x; i < K_PER_BUCK; i += blockDim.x)
-        outputKeys[i] = 0;
-    __syncthreads();
-    TIME(5)
-    int min_key_val = my_pe * K_PER_BUCK;
-    int max_key_val = (my_pe + 1) * K_PER_BUCK - 1;
-
-    int *key_buff_ptr = outputKeys - min_key_val;
-    for(int i = threadIdx.x; i < total_size; i += blockDim.x) {
-        atomicAdd(&key_buff_ptr[keyBuffer2[i]], 1);
-    }
-    __syncthreads();
-    TIME(6)
-    // Perform local scan on keys
-    for(int shift = 1; shift < K_PER_BUCK; shift *= 2) {
-        int temp = 0;
-        if(threadIdx.x >= shift && threadIdx.x < K_PER_BUCK) {
-            temp = outputKeys[threadIdx.x - shift];
-        }
-        __syncthreads();
-        if(threadIdx.x < K_PER_BUCK) {
-            outputKeys[threadIdx.x] += temp;
-        }
-        __syncthreads();
-    }
-    TIME(7)
-    OUTPUT_TIME()
-}
-
-void sort(volatile int *keys, int *keyBuffer1,
-            int *keyBuffer2, int *sendCount, 
-            int *recvCount, int *sendOffset,
-            int *recvOffset, int *outputKeys, 
-            size_t size, int max_iters, ncclComm_t comm) {
-    int nProcs, my_pe;
-    MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
-    
-    hipStream_t stream;
-    HIPCHECK(hipStreamCreate(&stream));
-
-    for(int iter = 0; iter < max_iters; ++iter) {
-        //fprintf(stderr, "%d: %d %d %p %p\n", my_pe, iter, max_iters, sendCount, recvCount);
-        sort1<<<1, WG_SIZE, 0, stream>>>(keys, keyBuffer1,
-                     keyBuffer2, sendCount, recvCount, sendOffset,
-                     recvOffset, outputKeys, size, nProcs, my_pe);
-        NCCLCHECK(ncclAllToAll(sendCount, recvCount, 1, 
-                               ncclInt, comm, stream));
-        NCCLCHECK(ncclAllToAll(sendOffset, recvOffset, 1, 
-                               ncclInt, comm, stream));
-        HIPCHECK(hipStreamSynchronize(stream));
-        NCCLCHECK(ncclGroupStart());
-        int total_size = 0;
-        for(int i = 0; i < nProcs; ++i) {
-            ncclSend(&keyBuffer1[sendOffset[i]], sendCount[i], 
-                     ncclInt, i, comm, stream);
-            ncclRecv(&keyBuffer2[total_size], recvCount[i], 
-                     ncclInt, i, comm, stream);
-            total_size += recvCount[i];
-        }
-        NCCLCHECK(ncclGroupEnd());
-        HIPCHECK(hipStreamSynchronize(stream));
-        sort2<<<1, WG_SIZE, 0, stream>>>(keys, keyBuffer1,
-                     keyBuffer2, sendCount, recvCount, sendOffset,
-                     recvOffset, outputKeys, size, nProcs, my_pe);
-        HIPCHECK(hipStreamSynchronize(stream));
-    }
-}
-
-bool verify(int *outputKeys, int *keyBuffer2, size_t size)
-{   
-    int num_pes, my_pe;
-    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
-
-    MPI_Status  status;
-    MPI_Request request;
-
-    int min_key_val = my_pe * (MAX_KEY / num_pes);
-    int max_key_val = (my_pe + 1) * (MAX_KEY / num_pes) - 1;
-
-    int *key_array = new int[size];
-    // Perform final untimed sort on keys
-    for(int i = 0; i < size; ++i)
-        if(outputKeys[keyBuffer2[i] - min_key_val] > 0)
-            key_array[--outputKeys[keyBuffer2[i] - min_key_val]] = keyBuffer2[i];
-        else {
-            fprintf(stderr, "%d: Found wrong key %d at %d with %d\n", my_pe, keyBuffer2[i], i, outputKeys[keyBuffer2[i]]);
-            return false;
-        }
-
-    if(size < 1)
-        size = 1;
-
-    int k;
-    const int MPI_TAG = 1000;
-    // Check if largest key is smaller than next processor's
-    if(my_pe > 0)
-        MPI_Irecv(&k, 1, MPI_INT, my_pe - 1, MPI_TAG, MPI_COMM_WORLD,
-                  &request);                   
-    if(my_pe < num_pes - 1)
-        MPI_Send(&key_array[size - 1], 1, MPI_INT, my_pe + 1, MPI_TAG,
-                 MPI_COMM_WORLD );
-    if(my_pe > 0)
-        MPI_Wait(&request, &status);
-
-    // Check if it is smaller
-    int j = 0;
-    if( my_pe > 0 && size > 1 )
-        if( k > key_array[0] )
-            j++;
-
-    // Check if keys correctly sorted
-    for(int i = 1; i < size; i++)
-        if(key_array[i - 1] > key_array[i])
-            j++;
-
-    delete[] key_array;
-
-    if(j != 0) {
-        fprintf(stderr, "Processor %d:  Full_verify: number of keys out of sort: %d\n",
-                my_pe, j );
-        return false;
-    }
-    return true;
-}
-
-void initGPU(ncclComm_t &comms) 
-{
-    // Calculation for local rank, taken from rccl-tests
-    int localRank = 0;
-    int nProcs, proc;
-    MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &proc);
-    char hostname[1024];
-    gethostname(hostname, 1024);
-    for (int i=0; i< 1024; i++) {
-        if (hostname[i] == '.') {
-            hostname[i] = '\0';
-            break;
-        }
-    }
-    uint64_t hostHashs[nProcs];
-    hostHashs[proc] = getHostHash(hostname);
-    MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
-    for (int p=0; p<nProcs; p++) {
-      if (p == proc) break;
-      if (hostHashs[p] == hostHashs[proc]) localRank++;
-    }
-    
-    /***
-     * Select a GPU
-     */    
-    int ndevices, my_device=0;
-    hipGetDeviceCount (&ndevices);
-    my_device = localRank % ndevices;
-    hipSetDevice(my_device);
-    
-    ncclUniqueId ncclId;
-    if (proc == 0) {
-        NCCLCHECK(ncclGetUniqueId(&ncclId));
-    }
-    MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD);
-    MPI_Barrier(MPI_COMM_WORLD);
-
-#ifdef RCCL_MULTIRANKPERGPU
-	NCCLCHECK(ncclCommInitRankMulti(&comms, nProcs, ncclId, proc, proc));
-#else
-	NCCLCHECK(ncclCommInitRank(&comms, nProcs, ncclId, proc));
-#endif
-
-    printf("Rank %d: Device %d, Host %s\n", proc, my_device, hostname);
-    fflush(stdout);
-    MPI_Barrier(MPI_COMM_WORLD);
-}
-
-void *rocshmem_malloc(size_t size)
-{
-    void *v;
-    hipMalloc((void **)&v, size);
-    return v;
-}
-
-int rocshmem_free(void *v)
-{
-    return hipFree(v);
-}
-
-int main(int argc, char *argv[])
-{
-    if(argc < 1) {
-        printf("Format: %s [iterations]\n", argv[0]);
-        return -1;
-    }
-
-    // Init stuff
-    MPI_Init(&argc, &argv);
-    ncclComm_t comms;
-    initGPU(comms);
-
-    int iterations = 1000;
-    if(argc > 1)
-        iterations = atoi(argv[1]);
-    
-    int num_pes, my_pe;
-    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
-
-    // Configure input and outputs
-    size_t size = 1024; //atoi(argv[1]);
-    int *keys, *outputKeys;
-    hipMalloc((void**)&keys, sizeof(int) * size);
-    hipMalloc((void**)&outputKeys, sizeof(int) * WG_SIZE);
-
-/*  Generate random number sequence and subsequent keys on all procs */
-    create_seq( find_my_seed( my_pe, 
-                              num_pes, 
-                              4*(long)size*num_pes,
-                              314159265.00,      /* Random number gen seed */
-                              1220703125.00 ),   /* Random number gen mult */
-                1220703125.00, keys, size );     /* Random number gen mult */
-
-
-    // Init buffers
-    int *keyBuffer1, *keyBuffer2;
-    keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size);
-    keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4);
-    
-    int *sendCount = 0, *recvCount = 0, *sendOffset = 0, *recvOffset = 0;
-    sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-    recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-    sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-    recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-
-    printf("Begin untimed run\n");
-    // Untimed run
-    MPI_Barrier(MPI_COMM_WORLD);
-    sort((int*)keys, keyBuffer1, keyBuffer2, 
-        sendCount, recvCount, sendOffset, recvOffset, 
-        outputKeys, size, 1, comms);
-    hipDeviceSynchronize();
-
-    printf("Verify untimed run\n");
-    // Verify correctness
-    if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) {
-        fprintf(stderr, "Wrong output\n");
-        return -1;
-    }
-
-    printf("Begin timed run\n");
-    // Timed run
-    MPI_Barrier(MPI_COMM_WORLD);
-    auto time_start = TIME_NOW;
-    sort((int*)keys, keyBuffer1, keyBuffer2, 
-        sendCount, recvCount, sendOffset, recvOffset, 
-        outputKeys, size, iterations, comms);
-    hipDeviceSynchronize();
-    double tot_time = (double)TIME_DIFF(TIME_NOW, time_start);
-    
-    double all_time = 0;
-    MPI_Allreduce(&tot_time, &all_time, 1,
-        MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-    if(my_pe == 0) {
-        printf("Avg time:\t%.3f\tus\n", all_time / (double)(1000.0 * iterations * num_pes));
-    }
-
-    // Verify correctness
-    if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) {
-        fprintf(stderr, "Wrong output\n");
-        return -1;
-    }
-
-    // Clean up
-    hipFree(keys);
-    hipFree(outputKeys);
-    rocshmem_free(keyBuffer1);
-    rocshmem_free(keyBuffer2);
-    rocshmem_free(sendCount);
-    rocshmem_free(recvCount);
-    rocshmem_free(sendOffset);
-    rocshmem_free(recvOffset);
-    ncclCommDestroy(comms);
-    MPI_Finalize();
-    return 0;
-}
diff --git a/projects/rocshmem/internal/workloads/src/sort_shmem.cu b/projects/rocshmem/internal/workloads/src/sort_shmem.cu
deleted file mode 100644
index a6f88c6ab4..0000000000
--- a/projects/rocshmem/internal/workloads/src/sort_shmem.cu
+++ /dev/null
@@ -1,358 +0,0 @@
-#include <iostream>
-#include <stdio.h>
-#include <mpi.h>
-#include <rocshmem/rocshmem.hpp>
-#include <unistd.h>
-using namespace std;
-using namespace rocshmem;
-
-#include "common.h"
-#include "sort.h"
-
-//#define TIME_PERF  // uncomment to compile in the per-phase device timers below
-#ifdef TIME_PERF
-#define TIMERS 10  // number of slots in timers[]
-__device__ uint64_t timers[TIMERS] = {0};  // duration of each phase, as measured by the latest TIME(n)
-__device__ uint64_t time_start;  // timestamp of the most recent TIMERS_START()/TIME()
-#define TIMERS_START() /* thread 0 records the current rocshmem_timer() value */ \
-    if(threadIdx.x == 0) {\
-        time_start = rocshmem_timer();\
-    }
-
-#define TIME(TIMER_NUM) /* thread 0 stores the time since the last mark in timers[TIMER_NUM] (overwriting it) and restarts the clock */ \
-    if(threadIdx.x == 0) {\
-        timers[TIMER_NUM] = rocshmem_timer() - time_start;\
-        time_start = rocshmem_timer();\
-    }
-
-#define OUTPUT_TIME() /* thread 0 on PE 0 prints every timer as a fraction of their sum; expects `my_pe` in the enclosing scope */ \
-    if(threadIdx.x == 0 && my_pe == 0) { \
-        uint64_t sum = 0; \
-        for(int i = 0; i < TIMERS; ++i) { \
-            sum += timers[i]; \
-        } \
-        for(int i = 0; i < TIMERS; ++i) { \
-            printf("%d: %f\n", i, (double)timers[i] / (double)sum); \
-        } \
-    }
-#else  // profiling disabled: every timing macro expands to nothing
-#define TIMERS_START()
-#define TIME(x)
-#define OUTPUT_TIME() 
-#endif
-
-// Workgroup-collective wrapper around rocshmem_ctx_int_wg_alltoall that
-// always exchanges exactly one int per PE of `team`, reading from `src` and
-// writing into `dst`.
-__device__ __inline__ void alltoall(rocshmem_ctx_t &ctx, 
-                                    rocshmem_team_t team, 
-                                    int *dst, int *src) {
-    // One element is contributed to / received from every team member.
-    const int elems_per_pe = 1;
-    rocshmem_ctx_int_wg_alltoall(ctx, team, dst, src, elems_per_pe);
-}
-
-// Distributed bucket sort of `size` integer keys per PE, executed by a single
-// workgroup (all indexing uses threadIdx.x only, never blockIdx.x).
-//
-// Each of the `max_iters` iterations performs:
-//   1. Histogram the local keys into one bucket per PE
-//      (bucket = key / K_PER_BUCK) and publish the counts in sendCount.
-//   2. Inclusive-scan the counts, derive sendOffset, and scatter the keys
-//      into keyBuffer1 grouped by destination bucket.
-//   3. Exchange counts and offsets with alltoall; thread 0 then pulls this
-//      PE's bucket out of every PE's keyBuffer1 into keyBuffer2.
-//   4. Histogram the received keys into outputKeys and inclusive-scan it, so
-//      outputKeys[k] ends up holding the number of received keys whose value
-//      is <= my_pe * K_PER_BUCK + k.
-//
-// Parameters:
-//   keys        input keys (read only), `size` elements
-//   keyBuffer1  staging buffer for the bucket-grouped local keys; read
-//               remotely by other PEs via rocshmem_int_get_nbi
-//   keyBuffer2  destination for the keys fetched from all PEs
-//   sendCount / recvCount    per-PE key counts before / after the alltoall
-//   sendOffset / recvOffset  per-PE offsets into keyBuffer1 before / after
-//   outputKeys  per-key-value cumulative counts (K_PER_BUCK entries used)
-//   team        team used for the sync and the alltoall collectives
-//
-// Both scans use one thread per element, so the kernel relies on
-// n_pes <= blockDim.x and K_PER_BUCK <= blockDim.x, and on n_pes <= MAX_PES
-// for the shared arrays.
-// NOTE(review): keys are presumably in [0, MAX_KEY) -- confirm with the
-// generator in sort.h.
-__global__ void sort(volatile int *keys, int *keyBuffer1,
-                     int *keyBuffer2, int *sendCount,
-                     int *recvCount, int *sendOffset,
-                     int *recvOffset, int *outputKeys,
-                     size_t size, rocshmem_team_t team,
-                     int max_iters) {
-    __shared__ rocshmem_ctx_t ctx;
-    __shared__ int bucketCounter[MAX_PES];
-    __shared__ int bucketPtr[MAX_PES];
-    __shared__ int total_size;
-
-    rocshmem_wg_init();
-    rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx);
-
-    int n_pes = rocshmem_ctx_n_pes(ctx);
-    int my_pe = rocshmem_my_pe();
-    int buckets = n_pes;
-
-    int tid = threadIdx.x;
-    const int K_PER_BUCK = (MAX_KEY / buckets);
-
-    for(int iter = 0; iter < max_iters; ++iter) {
-        // Reset
-        for(int i = threadIdx.x; i < buckets; i += blockDim.x) {
-            bucketCounter[i] = 0;
-            bucketPtr[i] = 0;
-        }
-        __syncthreads();
-        TIMERS_START()
-        // Count size of each bucket
-        for(int i = tid; i < size; i += blockDim.x) {
-            atomicAdd(&bucketCounter[keys[i] / K_PER_BUCK], 1);
-        }
-        __syncthreads();
-        TIME(0)
-        // Update in global memory
-        for(int i = tid; i < buckets; i += blockDim.x) {
-            sendCount[i] = bucketPtr[i] = bucketCounter[i];
-        }
-        __syncthreads();
-        TIME(1)
-        // Inclusive prefix sum over bucketPtr with doubling strides; the
-        // read and the update are separated by a barrier so no thread sees
-        // a partially updated neighbour.
-        for(int shift = 1; shift < buckets; shift *= 2) {
-            int temp = 0;
-            if(threadIdx.x >= shift && threadIdx.x < buckets) {
-                temp = bucketPtr[threadIdx.x - shift];
-            }
-            __syncthreads();
-            if(threadIdx.x < buckets) {
-                bucketPtr[threadIdx.x] += temp;
-            }
-            __syncthreads();
-        }
-        __syncthreads();
-        TIME(2)
-        // Find offsets of where we're sending: bucket start = inclusive
-        // prefix sum minus the bucket's own count.
-        for(int i = threadIdx.x; i < buckets; i += blockDim.x) {
-            sendOffset[i] = bucketPtr[i] - sendCount[i];
-        }
-        // Every offset must be read before the scatter below starts
-        // decrementing bucketPtr; threads that skip the loop above would
-        // otherwise race ahead and corrupt sendOffset.
-        __syncthreads();
-        // Sort keys into buckets: each bucket is filled from its end
-        // towards its start by decrementing its pointer.
-        for(int i = threadIdx.x; i < size; i += blockDim.x) {
-            int loc = atomicAdd(&bucketPtr[keys[i] / K_PER_BUCK], -1) - 1;
-            keyBuffer1[loc] = keys[i];
-        }
-        rocshmem_ctx_threadfence_system(ctx);
-        // Force sync to wait for all PEs to update bucket sizes
-        rocshmem_ctx_wg_team_sync(ctx, team);
-        TIME(3)
-        // Let all PEs know how many keys you wish to send
-        alltoall(ctx, team, recvCount, sendCount);
-        // Let all PEs know where the offsets are of the keys
-        alltoall(ctx, team, recvOffset, sendOffset);
-        __syncthreads();
-        TIME(4)
-        // Thread 0 fetches this PE's bucket from every PE, packing the
-        // pieces back to back in keyBuffer2.
-        if(threadIdx.x == 0) {
-            total_size = 0;
-            for(int i = 0; i < buckets; ++i) {
-                rocshmem_int_get_nbi(&keyBuffer2[total_size],
-                    &keyBuffer1[recvOffset[i]], recvCount[i], i);
-                total_size += recvCount[i];
-            }
-            // Wait for all non-blocking gets to complete.
-            rocshmem_quiet();
-        }
-        for(int i = threadIdx.x; i < K_PER_BUCK; i += blockDim.x)
-            outputKeys[i] = 0;
-        // Also publishes total_size to the rest of the workgroup.
-        __syncthreads();
-        TIME(5)
-        int min_key_val = my_pe * K_PER_BUCK;
-
-        // Bias the pointer so that indexing by key value lands in
-        // outputKeys[0 .. K_PER_BUCK).
-        int *key_buff_ptr = outputKeys - min_key_val;
-        for(int i = threadIdx.x; i < total_size; i += blockDim.x) {
-            atomicAdd(&key_buff_ptr[keyBuffer2[i]], 1);
-        }
-        __syncthreads();
-        TIME(6)
-        // Perform local scan on keys (same doubling-stride inclusive scan
-        // as above, now over the per-key counts).
-        for(int shift = 1; shift < K_PER_BUCK; shift *= 2) {
-            int temp = 0;
-            if(threadIdx.x >= shift && threadIdx.x < K_PER_BUCK) {
-                temp = outputKeys[threadIdx.x - shift];
-            }
-            __syncthreads();
-            if(threadIdx.x < K_PER_BUCK) {
-                outputKeys[threadIdx.x] += temp;
-            }
-            __syncthreads();
-        }
-        TIME(7)
-    }
-    OUTPUT_TIME()
-    rocshmem_wg_ctx_destroy(ctx);
-    rocshmem_wg_finalize();
-}
-
-// Host-side check of one sort pass.
-//
-//   outputKeys  cumulative per-key counts produced by the kernel; consumed
-//               (decremented) here while the keys are placed
-//   keyBuffer2  the keys this PE received
-//   size        number of keys this PE received
-//
-// The keys are placed into a temporary array using the cumulative counts,
-// the array is checked for ascending order, and the largest key of PE p is
-// compared against the smallest key of PE p + 1 through a point-to-point
-// MPI exchange. Returns true when everything is in order.
-//
-// Every PE takes part in the neighbour exchange even after a local failure,
-// so that a failing PE cannot leave its successor blocked in MPI_Wait.
-// NOTE(review): outputKeys and keyBuffer2 are dereferenced on the host, so
-// they are assumed to be host-accessible allocations -- confirm.
-bool verify(int *outputKeys, int *keyBuffer2, size_t size)
-{
-    int num_pes = rocshmem_n_pes();
-    int my_pe = rocshmem_my_pe();
-
-    MPI_Status  status;
-    MPI_Request request;
-
-    const int keys_per_pe = MAX_KEY / num_pes;
-    int min_key_val = my_pe * keys_per_pe;
-
-    // At least one zero-initialised element, so the boundary key sent below
-    // is always a defined value even when no keys were received.
-    int *key_array = new int[size > 0 ? size : 1]();
-    bool keys_ok = true;
-
-    // Perform final untimed sort on keys
-    for(size_t i = 0; i < size; ++i) {
-        const int slot = keyBuffer2[i] - min_key_val;
-        // A key outside this PE's range has no counter at all.
-        const bool in_range = slot >= 0 && slot < keys_per_pe;
-        const int remaining = in_range ? outputKeys[slot] : 0;
-        if(remaining > 0) {
-            key_array[--outputKeys[slot]] = keyBuffer2[i];
-        } else {
-            fprintf(stderr, "%d: Found wrong key %d at %d with %d\n", my_pe, keyBuffer2[i], (int)i, remaining);
-            keys_ok = false;
-            break;
-        }
-    }
-
-    if(size < 1)
-        size = 1;
-
-    int k = 0;
-    const int boundary_tag = 1000;
-    // Check if largest key is smaller than next processor's
-    if(my_pe > 0)
-        MPI_Irecv(&k, 1, MPI_INT, my_pe - 1, boundary_tag, MPI_COMM_WORLD,
-                  &request);
-    if(my_pe < num_pes - 1)
-        MPI_Send(&key_array[size - 1], 1, MPI_INT, my_pe + 1, boundary_tag,
-                 MPI_COMM_WORLD );
-    if(my_pe > 0)
-        MPI_Wait(&request, &status);
-
-    if(!keys_ok) {
-        delete[] key_array;
-        return false;
-    }
-
-    // Check if it is smaller
-    int j = 0;
-    if( my_pe > 0 && size > 1 )
-        if( k > key_array[0] )
-            j++;
-
-    // Check if keys correctly sorted
-    for(size_t i = 1; i < size; i++)
-        if(key_array[i - 1] > key_array[i])
-            j++;
-
-    delete[] key_array;
-
-    if(j != 0) {
-        fprintf(stderr, "Processor %d:  Full_verify: number of keys out of sort: %d\n",
-                my_pe, j );
-        return false;
-    }
-    return true;
-}
-
-// Binds the calling process to a GPU.
-//
-// The node-local rank is the number of lower-numbered PEs that report the
-// same (domain-stripped) host name; the device index is that rank modulo the
-// number of visible HIP devices. Prints the chosen mapping and ends with an
-// MPI barrier. Aborts the MPI job when no HIP device can be enumerated.
-void initGPU() 
-{
-    // Calculation for local rank, taken from rccl-tests
-    int localRank = 0;
-    int proc = rocshmem_my_pe();
-    int nProcs = rocshmem_n_pes();
-    char hostname[1024];
-    gethostname(hostname, 1024);
-    // gethostname() may leave a truncated name unterminated.
-    hostname[sizeof(hostname) - 1] = '\0';
-    // Strip the domain part; stop at the terminator so that bytes past the
-    // end of the name are never inspected.
-    for (int i=0; hostname[i] != '\0'; i++) {
-        if (hostname[i] == '.') {
-            hostname[i] = '\0';
-            break;
-        }
-    }
-    // Heap storage instead of a variable-length array, which is not
-    // standard C++.
-    uint64_t *hostHashs = new uint64_t[nProcs];
-    hostHashs[proc] = getHostHash(hostname);
-    MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
-    // Count the lower-numbered PEs that live on this host.
-    for (int p=0; p<proc; p++) {
-      if (hostHashs[p] == hostHashs[proc]) localRank++;
-    }
-    delete[] hostHashs;
-    
-    /***
-     * Select a GPU
-     */    
-    int ndevices = 0, my_device = 0;
-    if (hipGetDeviceCount(&ndevices) != hipSuccess || ndevices < 1) {
-        // Without this guard the modulo below would divide by zero.
-        fprintf(stderr, "Rank %d: no usable HIP device on host %s\n", proc, hostname);
-        MPI_Abort(MPI_COMM_WORLD, 1);
-        return;
-    }
-    my_device = localRank % ndevices;
-    hipSetDevice(my_device);
-
-    printf("Rank %d: Device %d, Host %s\n", proc, my_device, hostname);
-    fflush(stdout);
-    MPI_Barrier(MPI_COMM_WORLD);
-}
-
-// Benchmark driver for the rocSHMEM sort kernel.
-//
-// Usage: <binary> [iterations]   (iterations defaults to 1000, must be >= 1)
-//
-// Runs the kernel once untimed and verifies the result, then runs it with
-// `iterations` iterations under a wall-clock timer, prints the average time
-// per iteration on PE 0 and verifies again. Returns 0 on success and -1 on
-// a bad argument, a failed allocation or a failed verification.
-int main(int argc, char *argv[])
-{
-    // Validate the command line before any runtime is brought up, so a bad
-    // argument can be rejected without leaving collectives half-started.
-    int iterations = 1000;
-    if(argc > 1)
-        iterations = atoi(argv[1]);
-    if(iterations < 1) {
-        // The average below divides by the iteration count.
-        fprintf(stderr, "Usage: %s [iterations >= 1]\n", argv[0]);
-        return -1;
-    }
-
-    // Init rocshmem stuff
-    // NOTE(review): initGPU() queries the PE id and calls MPI before
-    // rocshmem_init() runs, and no MPI_Init is visible in this file --
-    // confirm the intended initialisation order.
-    initGPU();
-    rocshmem_init(NUM_WGS);
-    int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD);
-    rocshmem_team_t team_world_dup = ROCSHMEM_TEAM_INVALID;
-    rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD,
-                                 0,
-                                 1,
-                                 n_pes,
-                                 nullptr,
-                                 0,
-                                 &team_world_dup);
-    if(team_world_dup == ROCSHMEM_TEAM_INVALID) {
-        fprintf(stderr, "Failed to duplicate the world team\n");
-        return -1;
-    }
-
-    int num_pes = rocshmem_n_pes();
-    int my_pe = rocshmem_my_pe();
-
-    // Configure input and outputs (fixed problem size per PE)
-    size_t size = 1024;
-    int *keys = nullptr, *outputKeys = nullptr;
-    if(hipMalloc((void**)&keys, sizeof(int) * size) != hipSuccess ||
-       hipMalloc((void**)&outputKeys, sizeof(int) * WG_SIZE) != hipSuccess) {
-        fprintf(stderr, "Failed to allocate key buffers\n");
-        return -1;
-    }
-
-/*  Generate random number sequence and subsequent keys on all procs */
-    create_seq( find_my_seed( my_pe, 
-                              num_pes, 
-                              4*(long)size*num_pes,
-                              314159265.00,      /* Random number gen seed */
-                              1220703125.00 ),   /* Random number gen mult */
-                1220703125.00, keys, size );     /* Random number gen mult */
-
-
-    // Init buffers
-    int *keyBuffer1, *keyBuffer2;
-    keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size);
-    // Room for four times the local key count on the receiving side.
-    keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4);
-    
-    int *sendCount, *recvCount, *sendOffset, *recvOffset;
-    sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-    recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-    sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-    recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES);
-
-    // Untimed run
-    rocshmem_barrier_all();
-    sort<<<1, WG_SIZE>>>((int*)keys, keyBuffer1, keyBuffer2, 
-        sendCount, recvCount, sendOffset, recvOffset, 
-        outputKeys, size, team_world_dup, 1);
-    hipDeviceSynchronize();
-
-    // Verify correctness; the last cumulative count is the number of keys
-    // this PE received.
-    if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) {
-        fprintf(stderr, "Wrong output\n");
-        return -1;
-    }
-
-    // Timed run
-    rocshmem_barrier_all();
-    auto time_start = TIME_NOW;
-    sort<<<1, WG_SIZE>>>((int*)keys, keyBuffer1, keyBuffer2, 
-        sendCount, recvCount, sendOffset, recvOffset, 
-        outputKeys, size, team_world_dup, iterations);
-    hipDeviceSynchronize();
-    double tot_time = (double)TIME_DIFF(TIME_NOW, time_start);
-    
-    // Sum the per-PE times so PE 0 can report the mean over PEs and
-    // iterations.
-    double all_time = 0;
-    MPI_Allreduce(&tot_time, &all_time, 1,
-        MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-    if(my_pe == 0) {
-        printf("Avg time:\t%f\tus\n", all_time / 
-                (double)(1000.0 * iterations * num_pes));
-    }
-
-    // Verify correctness
-    if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) {
-        fprintf(stderr, "Wrong output\n");
-        return -1;
-    }
-
-    // Clean up
-    hipFree(keys);
-    hipFree(outputKeys);
-    rocshmem_free(keyBuffer1);
-    rocshmem_free(keyBuffer2);
-    rocshmem_free(sendCount);
-    rocshmem_free(recvCount);
-    rocshmem_free(sendOffset);
-    rocshmem_free(recvOffset);
-    // Release the team created by the split above before shutting down.
-    rocshmem_team_destroy(team_world_dup);
-    rocshmem_finalize();
-    return 0;
-}