diff --git a/projects/rocshmem/internal/clients/shmem_rccl/CMakeLists.txt b/projects/rocshmem/internal/clients/shmem_rccl/CMakeLists.txt deleted file mode 100644 index fa07c7435f..0000000000 --- a/projects/rocshmem/internal/clients/shmem_rccl/CMakeLists.txt +++ /dev/null @@ -1,95 +0,0 @@ -############################################################################### -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. 
-###############################################################################
-
-cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
-
-###############################################################################
-# GLOBAL COMPILE FLAGS
-###############################################################################
-set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-# Default to ROCm's hipcc, but never override a compiler the user selected
-# with -DCMAKE_CXX_COMPILER=... or through the CXX environment variable.
-if(NOT DEFINED CMAKE_CXX_COMPILER AND NOT DEFINED ENV{CXX})
-  set(CMAKE_CXX_COMPILER /opt/rocm/bin/hipcc)
-endif()
-
-set(CMAKE_CXX_FLAGS_DEBUG "-O0 -ggdb")
-
-###############################################################################
-# DEFAULT BUILD TYPE
-###############################################################################
-# CMAKE_BUILD_TYPE is meaningless for multi-config generators, so only
-# default it for single-config ones and only when the user left it empty.
-get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
-if(NOT is_multi_config AND NOT CMAKE_BUILD_TYPE)
-  message(STATUS "CMAKE_BUILD_TYPE unspecified: generating Release build")
-
-  set(
-    CMAKE_BUILD_TYPE
-    "Release"
-    CACHE
-    STRING
-    "build type: Release, Debug, RelWithDebInfo, MinSizeRel"
-    FORCE
-  )
-endif()
-
-###############################################################################
-# PROJECT
-###############################################################################
-project(rocshmem_example_driver VERSION 1.1.0 LANGUAGES CXX)
-
-###############################################################################
-# SOURCES
-###############################################################################
-add_executable(
-  ${PROJECT_NAME}
-  test_driver.cpp
-  tester.cpp
-  tester_arguments.cpp
-  primitive_tester.cpp
-)
-
-target_include_directories(
-  ${PROJECT_NAME}
-  PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}
-)
-
-###############################################################################
-# ROCSHMEM
-###############################################################################
-find_package(hip REQUIRED)
-find_package(rocshmem CONFIG REQUIRED)
-
-# Linking the imported targets is what propagates their include directories;
-# a target name must not be passed to target_include_directories(), where it
-# would be treated as a (non-existent) relative directory.
-target_link_libraries(
-  ${PROJECT_NAME}
-  PRIVATE
-    rocshmem::rocshmem
-    hip::host
-)
-
-# xnack allows address translation fault recovery
-# required option for managed heap configs
-#   -mxnack
-target_link_options(
-  ${PROJECT_NAME}
-  PRIVATE
-    -fgpu-rdc
-)
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/build_configs/debug b/projects/rocshmem/internal/clients/shmem_rccl/build_configs/debug
deleted file mode 100755
index f7cbb1967e..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/build_configs/debug
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-# Configure and build a Debug tree in the current directory.
-# Usage: debug [rocshmem-install-prefix]    (default: ~/rocshmem)
-
-# Stop at the first failure so a broken configure is not followed by a build.
-set -e
-
-install_path="${1:-$HOME/rocshmem}"
-
-src_path="$(dirname "$(realpath "$0")")/.."
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Debug \
-    -DCMAKE_VERBOSE_MAKEFILE=ON \
-    -Drocshmem_DIR="$install_path/share/cmake/rocshmem" \
-    "$src_path"
-cmake --build . --parallel 8
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/build_configs/release b/projects/rocshmem/internal/clients/shmem_rccl/build_configs/release
deleted file mode 100755
index baa8b4277a..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/build_configs/release
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-# Configure and build a Release tree in the current directory.
-# Usage: release [rocshmem-install-prefix]    (default: ~/rocshmem)
-
-# Stop at the first failure so a broken configure is not followed by a build.
-set -e
-
-install_path="${1:-$HOME/rocshmem}"
-
-src_path="$(dirname "$(realpath "$0")")/.."
-
-cmake \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_VERBOSE_MAKEFILE=OFF \
-    -Drocshmem_DIR="$install_path/share/cmake/rocshmem" \
-    "$src_path"
-cmake --build . --parallel 8
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.cpp b/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.cpp
deleted file mode 100644
index fc8dd91f31..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- *****************************************************************************/
-
-#include "primitive_tester.hpp"
-
-#include <mpi.h>
-#include <rocshmem/rocshmem.hpp>
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <vector>
-
-using namespace rocshmem;
-
-/******************************************************************************
- * DEVICE TEST KERNEL
- *****************************************************************************/
-/**
- * Each workgroup repeats, `loop` times: put `size` bytes from its slice of
- * s_buf (indexed by block id) into slot `my_pe` of r_buf on the PE whose id
- * equals the block id, then thread 0 writes the iteration number into
- * flag[my_pe] on that PE and waits until its own flag[block_id] reaches the
- * same value.
- *
- * NOTE(review): the block id doubles as the target PE, so this presumably
- * expects the grid size to equal the number of PEs -- confirm with callers.
- * NOTE(review): the quiet/fence calls between the non-blocking put and the
- * flag update are commented out; confirm that ordering is intentional.
- */
-__global__ void
-PrimitiveTest(int loop,
-              int *flag,
-              char *s_buf,
-              char *r_buf,
-              int size,
-              int my_pe,
-              ShmemContextType ctx_type)
-{
-    __shared__ rocshmem_ctx_t ctx;
-    rocshmem_wg_init();
-    rocshmem_wg_ctx_create(ctx_type, &ctx);
-
-    int block_id = hipBlockIdx_x;
-    for (int i = 0; i < loop; i++) {
-        rocshmem_ctx_putmem_nbi_wg(ctx,
-                                   &r_buf[my_pe * size],
-                                   &s_buf[block_id * size],
-                                   size,
-                                   block_id);
-        if (hipThreadIdx_x == 0) {
-            //rocshmem_ctx_quiet(ctx);
-            //rocshmem_ctx_threadfence_system(ctx);
-            rocshmem_ctx_int_p(ctx, &flag[my_pe], i + 1, block_id);
-            //rocshmem_ctx_quiet(ctx);
-            rocshmem_int_wait_until(&flag[block_id], ROCSHMEM_CMP_EQ, i + 1);
-        }
-        __syncthreads();
-    }
-
-    rocshmem_wg_ctx_destroy(ctx);
-    rocshmem_wg_finalize();
-}
-
-/******************************************************************************
- * HOST TESTER CLASS METHODS
- *****************************************************************************/
-/**
- * Allocates and zeroes one int flag per PE on the symmetric heap. The data
- * buffers are allocated later, in launchKernel(), once the size is known.
- */
-PrimitiveTester::PrimitiveTester(TesterArguments args)
-    : Tester(args)
-{
-    // The allocation must be sized in bytes: one int per PE.
-    const size_t flag_bytes = args.numprocs * sizeof(int);
-
-    flag = (int *)rocshmem_malloc(flag_bytes);
-    if (flag == nullptr) {
-        fprintf(stderr, "rocshmem_malloc failed for %zu flag bytes\n",
-                flag_bytes);
-        exit(-1);
-    }
-    memset(flag, 0, flag_bytes);
-}
-
-/**
- * Releases every symmetric-heap allocation this tester still owns.
- */
-PrimitiveTester::~PrimitiveTester()
-{
-    if (s_buf != nullptr) {
-        rocshmem_free(s_buf);
-    }
-    if (r_buf != nullptr) {
-        rocshmem_free(r_buf);
-    }
-    if (flag != nullptr) {
-        rocshmem_free(flag);
-    }
-}
-
-/**
- * Fills the send buffer with '0' and the receive buffer with '1' so that
- * verifyResults() can tell delivered bytes from untouched ones.
- */
-void
-PrimitiveTester::resetBuffers(uint64_t size)
-{
-    memset(s_buf, '0', size * args.numprocs);
-    memset(r_buf, '1', size * args.numprocs);
-}
-
-/**
- * (Re)allocates the data buffers for this message size, exchanges a small
- * MPI_Allgather with all ranks and launches PrimitiveTest on `stream`.
- */
-void
-PrimitiveTester::launchKernel(dim3 gridSize,
-                              dim3 blockSize,
-                              int loop,
-                              uint64_t size,
-                              int nproc, int my_pe)
-{
-    // launchKernel() runs once per message size; drop the previous pair of
-    // buffers instead of leaking them.
-    if (s_buf != nullptr) {
-        rocshmem_free(s_buf);
-        s_buf = nullptr;
-    }
-    if (r_buf != nullptr) {
-        rocshmem_free(r_buf);
-        r_buf = nullptr;
-    }
-
-    s_buf = (char *)rocshmem_malloc(size * nproc);
-    r_buf = (char *)rocshmem_malloc(size * nproc);
-    if (s_buf == nullptr || r_buf == nullptr) {
-        fprintf(stderr, "rocshmem_malloc failed for message size %llu\n",
-                (unsigned long long)size);
-        exit(-1);
-    }
-    resetBuffers(size);
-
-    // The gathered payload is never read.
-    // NOTE(review): this looks like it only serves as a synchronization
-    // point across ranks before the launch -- confirm.
-    const int chunk = 64;
-    std::vector<char> sendBuf(chunk, 0);
-    std::vector<char> recvBuf(static_cast<size_t>(chunk) * nproc, 0);
-
-    MPI_Allgather(sendBuf.data(), chunk, MPI_CHAR,
-                  recvBuf.data(), chunk, MPI_CHAR,
-                  MPI_COMM_WORLD);
-
-    size_t shared_bytes;
-    rocshmem_dynamic_shared(&shared_bytes);
-
-    hipLaunchKernelGGL(PrimitiveTest,
-                       gridSize,
-                       blockSize,
-                       shared_bytes,
-                       stream,
-                       loop,
-                       flag,
-                       s_buf,
-                       r_buf,
-                       size,
-                       my_pe,
-                       _shmem_context);
-
-    //num_msgs = (loop + args.skip) * gridSize.x;
-    num_timed_msgs = loop;
-}
-
-/**
- * On rank 0 only, checks that every byte of the receive buffer now holds the
- * '0' pattern of the send buffers; exits with -1 on the first mismatch.
- */
-void
-PrimitiveTester::verifyResults(uint64_t size)
-{
-    const unsigned check_id = 0;
-    if (args.myid != check_id) {
-        return;
-    }
-
-    // 64-bit index: size * numprocs can exceed the range of int.
-    const uint64_t total = size * args.numprocs;
-    for (uint64_t i = 0; i < total; i++) {
-        if (r_buf[i] != '0') {
-            fprintf(stderr, "Data validation error at idx %llu\n",
-                    (unsigned long long)i);
-            fprintf(stderr, "Got %c, Expected %c\n", r_buf[i], '0');
-            exit(-1);
-        }
-    }
-}
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.hpp b/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.hpp
deleted file mode 100644
index 1c5009c190..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/primitive_tester.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef _PRIMITIVE_TESTER_HPP_
-#define _PRIMITIVE_TESTER_HPP_
-
-#include "tester.hpp"
-#include <cstdint>
-
-/******************************************************************************
- * HOST TESTER CLASS
- *****************************************************************************/
-/**
- * Tester that drives the PrimitiveTest kernel. It owns three symmetric-heap
- * allocations: the send buffer, the receive buffer and the per-PE flags.
- */
-class PrimitiveTester : public Tester
-{
-  public:
-    explicit PrimitiveTester(TesterArguments args);
-    ~PrimitiveTester() override;
-
-  protected:
-    void
-    resetBuffers(uint64_t size) override;
-
-    void
-    launchKernel(dim3 gridSize,
-                 dim3 blockSize,
-                 int loop,
-                 uint64_t size,
-                 int nproc, int my_pe) override;
-
-    void
-    verifyResults(uint64_t size) override;
-
-    // Send buffer, (re)allocated by launchKernel().
-    char *s_buf = nullptr;
-    // Receive buffer, (re)allocated by launchKernel().
-    char *r_buf = nullptr;
-    // One int per PE, allocated in the constructor.
-    int *flag = nullptr;
-};
-
-#endif
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/test_driver.cpp b/projects/rocshmem/internal/clients/shmem_rccl/test_driver.cpp
deleted file mode 100644
index 511b42ac9d..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/test_driver.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include <hip/hip_runtime.h>
-#include <rocshmem/rocshmem.hpp>
-
-#include <cstdio>
-#include <vector>
-
-#include "tester.hpp"
-#include "tester_arguments.hpp"
-
-using namespace rocshmem;
-
-/**
- * Test driver entry point: parse the command line, bind this process to a
- * GPU, initialize rocshmem, run every tester produced by the factory and
- * finalize rocshmem.
- *
- * Returns 0 on success and -1 when no GPU can be selected.
- */
-int main(int argc, char *argv[])
-{
-    /**
-     * Setup the tester arguments.
-     */
-    TesterArguments args(argc, argv);
-
-    /***
-     * Select a GPU
-     *
-     * NOTE(review): rocshmem_my_pe() is called before rocshmem_init();
-     * confirm that the library supports querying the PE id this early.
-     */
-    int rank = rocshmem_my_pe();
-
-    int ndevices = 0;
-    hipError_t status = hipGetDeviceCount(&ndevices);
-    if (status != hipSuccess || ndevices <= 0) {
-        // Without this check "rank % ndevices" could divide by zero.
-        fprintf(stderr, "No usable HIP device found: %s\n",
-                hipGetErrorString(status));
-        return -1;
-    }
-
-    int my_device = rank % ndevices;
-    status = hipSetDevice(my_device);
-    if (status != hipSuccess) {
-        fprintf(stderr, "hipSetDevice(%d) failed: %s\n",
-                my_device, hipGetErrorString(status));
-        return -1;
-    }
-
-    /**
-     * Must initialize rocshmem to access arguments needed by the tester.
-     */
-    rocshmem_init(args.num_wgs);
-
-    /**
-     * Now grab the arguments from rocshmem.
-     */
-    args.get_rocshmem_arguments();
-
-    /**
-     * Using the arguments we just constructed, call the tester factory
-     * method to get the tester (specified by the arguments).
-     */
-    std::vector<Tester *> tests = Tester::create(args);
-
-    /**
-     * Run the tests
-     */
-    for (Tester *test : tests) {
-        test->execute();
-
-        /**
-         * The tester factory method news the tester to create it so we clean
-         * up the memory here.
-         */
-        delete test;
-    }
-
-    /**
-     * The rocshmem library needs to be cleaned up with this call. It pairs
-     * with the init function above.
-     */
-    rocshmem_finalize();
-
-    return 0;
-}
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/tester.cpp b/projects/rocshmem/internal/clients/shmem_rccl/tester.cpp
deleted file mode 100644
index d43ee09846..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/tester.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#include "tester.hpp"
-
-#include <mpi.h>
-
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <vector>
-
-#include "primitive_tester.hpp"
-
-namespace {
-
-/** Print a readable message and exit when a HIP runtime call fails. */
-void
-checkHip(hipError_t status, const char *what)
-{
-    if (status != hipSuccess) {
-        fprintf(stderr, "%s failed: %s\n", what, hipGetErrorString(status));
-        exit(-1);
-    }
-}
-
-}  // namespace
-
-/**
- * Creates the stream, the two timing events and the per-workgroup timer
- * buffer (device memory obtained from hipMalloc).
- */
-Tester::Tester(TesterArguments args)
-    : args(args)
-{
-    _type = (TestType) args.algorithm;
-    _shmem_context = args.shmem_context;
-    checkHip(hipStreamCreate(&stream), "hipStreamCreate");
-    checkHip(hipEventCreate(&start_event), "hipEventCreate(start)");
-    checkHip(hipEventCreate(&stop_event), "hipEventCreate(stop)");
-    checkHip(hipMalloc((void**)&timer, sizeof(uint64_t) * args.num_wgs),
-             "hipMalloc(timer)");
-}
-
-/**
- * Releases the HIP resources in reverse order of creation.
- */
-Tester::~Tester()
-{
-    hipFree(timer);
-    hipEventDestroy(stop_event);
-    hipEventDestroy(start_event);
-    hipStreamDestroy(stream);
-}
-
-/**
- * Factory: announces the selected test on rank 0 and returns the testers to
- * run. Every algorithm value currently maps to a single PrimitiveTester.
- * The caller owns the returned pointers and must delete them.
- */
-std::vector<Tester *>
-Tester::create(TesterArguments args)
-{
-    int rank = args.myid;
-    std::vector<Tester *> testers;
-
-    const char *label = "Unknown***";
-    switch ((TestType) args.algorithm) {
-        case AlltoAll_Put:
-            label = "AlltoAll Puts***";
-            break;
-        case AlltoAll_Get:
-            label = "AlltoAll Gets***";
-            break;
-        default:
-            break;
-    }
-
-    if (rank == 0) {
-        std::cout << "*** Creating Test: " << label << std::endl;
-    }
-
-    testers.push_back(new PrimitiveTester(args));
-    return testers;
-}
-
-/**
- * Runs the test once per message size, doubling the size from
- * args.min_msg_size up to args.max_msg_size, and prints the timing for each
- * size on rank 0.
- */
-void
-Tester::execute()
-{
-    int num_loops = args.loop;
-    const uint64_t max_size = args.max_msg_size;
-
-    /**
-     * Some tests loop through data sizes in powers of 2 and report the
-     * results for those ranges.
-     */
-    uint64_t size = args.min_msg_size;
-    while (size <= max_size) {
-        /**
-         * Restricts the number of iterations of really large messages.
-         */
-        if (size > static_cast<uint64_t>(args.large_message_size)) {
-            num_loops = args.loop_large;
-        }
-
-        /**
-         * TODO:
-         * Verify that this timer type is actually uint64_t on the
-         * device side.
-         *
-         * The timer lives in hipMalloc'd memory, so it is cleared through
-         * the HIP runtime (on the test stream) rather than with memset.
-         */
-        if (timer != nullptr) {
-            checkHip(hipMemsetAsync(timer, 0,
-                                    sizeof(uint64_t) * args.num_wgs, stream),
-                     "hipMemsetAsync(timer)");
-        }
-
-        const dim3 blockSize(args.wg_size, 1, 1);
-        const dim3 gridSize(args.num_wgs, 1, 1);
-
-        hipEventRecord(start_event, stream);
-
-        launchKernel(gridSize, blockSize, num_loops, size,
-                     args.numprocs, args.myid);
-
-        hipEventRecord(stop_event, stream);
-        hipError_t err = hipStreamSynchronize(stream);
-        if (err != hipSuccess) {
-            printf("error = %d \n", err);
-        }
-
-        // data validation
-        verifyResults(size);
-
-        barrier();
-        resetBuffers(size);
-
-        print(size);
-
-        /**
-         * Advance to the next power-of-two step. A starting size of 0 would
-         * never grow under doubling, and doubling past max_size/2 could wrap
-         * around, so both cases are handled explicitly.
-         */
-        if (size == 0) {
-            size = 1;
-        } else if (size > max_size / 2) {
-            break;
-        } else {
-            size <<= 1;
-        }
-    }
-}
-
-
-/**
- * On rank 0, prints the average latency and bandwidth derived from the
- * elapsed time between the start and stop events.
- */
-void
-Tester::print(uint64_t size)
-{
-    if (args.myid != 0) {
-        return;
-    }
-
-    float total_kern_time_ms;
-    hipEventElapsedTime(&total_kern_time_ms, start_event, stop_event);
-    float total_kern_time_s = total_kern_time_ms / 1000;
-    double bandwidth_avg_gbs =
-        num_timed_msgs * size * bw_factor / total_kern_time_s / pow(2, 30);
-
-    float latency_us = (total_kern_time_ms * 1000) / num_timed_msgs;
-
-    int field_width = 20;
-    int float_precision = 2;
-
-    printf("\n##### Message Size %" PRIu64 " #####\n", size);
-
-    printf("%*s%*s\n",
-           field_width + 1, "Latency AVG (us)",
-           field_width + 1, "Bandwidth (GB/s)");
-
-    printf("%*.*f %*.*f \n",
-           field_width, float_precision, latency_us,
-           field_width, float_precision, bandwidth_avg_gbs);
-
-    fflush(stdout);
-}
-
-/**
- * Blocks until every MPI rank reaches this point.
- */
-void
-Tester::barrier()
-{
-    MPI_Barrier(MPI_COMM_WORLD);
-}
-
-/**
- * Converts a raw GPU timer tick count to microseconds.
- */
-uint64_t
-Tester::gpuCyclesToMicroseconds(uint64_t cycles)
-{
-    /**
-     * The dGPU asm core timer runs at 27MHz. This is different from the
-     * core clock returned by HIP. For an APU, this is different and might
-     * need adjusting.
-     */
-    uint64_t gpu_frequency_MHz = 27;
-
-    /**
-     * hipDeviceGetAttribute(&gpu_frequency_khz,
-     *                       hipDeviceAttributeClockRate,
-     *                       0);
-     */
-
-    return cycles / gpu_frequency_MHz;
-}
-
-/**
- * Averages the per-workgroup timers, in microseconds. Returns 0 when there
- * are no workgroups.
- */
-uint64_t
-Tester::timerAvgInMicroseconds()
-{
-    if (args.num_wgs == 0 || timer == nullptr) {
-        return 0;
-    }
-
-    // The timers are in device memory; copy them out before reading.
-    std::vector<uint64_t> host_timer(args.num_wgs, 0);
-    checkHip(hipMemcpy(host_timer.data(), timer,
-                       sizeof(uint64_t) * args.num_wgs,
-                       hipMemcpyDeviceToHost),
-             "hipMemcpy(timer)");
-
-    uint64_t sum = 0;
-    for (uint64_t cycles : host_timer) {
-        sum += gpuCyclesToMicroseconds(cycles);
-    }
-
-    return sum / args.num_wgs;
-}
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/tester.hpp b/projects/rocshmem/internal/clients/shmem_rccl/tester.hpp
deleted file mode 100644
index 831cc10064..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/tester.hpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef _TESTER_HPP_
-#define _TESTER_HPP_
-
-#include <hip/hip_runtime.h>
-
-#include <cstdint>
-#include <vector>
-
-#include "tester_arguments.hpp"
-
-/******************************************************************************
- * TESTER CLASS TYPES
- *****************************************************************************/
-enum TestType
-{
-    AlltoAll_Put = 0,
-    AlltoAll_Get = 1
-};
-
-typedef int ShmemContextType;
-
-/******************************************************************************
- * TESTER INTERFACE
- *****************************************************************************/
-/**
- * Base class for all testers. It owns the HIP stream, the timing events and
- * the timer buffer; subclasses supply the buffers, the kernel launch and the
- * result verification.
- */
-class Tester
-{
-  public:
-    explicit Tester(TesterArguments args);
-    virtual ~Tester();
-
-    // The HIP handles below are released in the destructor, so a copy would
-    // release them twice.
-    Tester(const Tester &) = delete;
-    Tester &operator=(const Tester &) = delete;
-
-    void
-    execute();
-
-    /**
-     * Factory for the testers selected by args.algorithm. The caller owns
-     * the returned pointers.
-     */
-    static std::vector<Tester *>
-    create(TesterArguments args);
-
-  protected:
-    virtual void
-    resetBuffers(uint64_t size) = 0;
-
-    virtual void
-    preLaunchKernel() {}
-
-    virtual void
-    launchKernel(dim3 gridSize,
-                 dim3 blockSize,
-                 int loop,
-                 uint64_t size,
-                 int nproc, int my_pe) = 0;
-
-    virtual void
-    postLaunchKernel() {}
-
-    virtual void
-    verifyResults(uint64_t size) = 0;
-
-    int num_msgs = 0;
-    int num_timed_msgs = 0;
-    int bw_factor = 1;
-
-    TesterArguments args;
-
-    TestType _type;
-    // presumably ROCSHMEM_CTX_WG_PRIVATE -- verify against the rocshmem header
-    ShmemContextType _shmem_context = 8;
-
-    hipStream_t stream = nullptr;
-
-    uint64_t *timer = nullptr;
-
-  private:
-    void
-    print(uint64_t size);
-
-    void
-    barrier();
-
-    uint64_t
-    gpuCyclesToMicroseconds(uint64_t cycles);
-
-    uint64_t
-    timerAvgInMicroseconds();
-
-    bool
-    peLaunchesKernel();
-
-    hipEvent_t start_event = nullptr;
-    hipEvent_t stop_event = nullptr;
-};
-
-#endif /* _TESTER_HPP_ */
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.cpp b/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.cpp
deleted file mode 100644
index 6c835169bb..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/****************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- *****************************************************************************/
-
-#include "tester.hpp"
-#include "tester_arguments.hpp"
-
-#include <cstdlib>
-#include <iostream>
-
-#include <rocshmem/rocshmem.hpp>
-
-using namespace rocshmem;
-
-/**
- * Parses the command line. Every recognised option takes exactly one value:
- *   -w  num_wgs         -s  min_msg_size    -S  max_msg_size
- *   -a  algorithm       -z  wg_size         -x  shmem_context
- * An unknown option, or an option without its value, prints the usage text
- * and exits with -1.
- */
-TesterArguments::TesterArguments(int argc, char *argv[])
-{
-    for (int i = 1; i < argc; i++) {
-        const std::string option = argv[i];
-
-        // Each option consumes the next argument; never read past argv.
-        if (i + 1 >= argc) {
-            show_usage(argv[0]);
-            exit(-1);
-        }
-        const char *value = argv[i + 1];
-
-        if (option == "-w") {
-            num_wgs = atoi(value);
-        } else if (option == "-S") {
-            max_msg_size = atoll(value);
-        } else if (option == "-s") {
-            min_msg_size = atoll(value);
-        } else if (option == "-a") {
-            algorithm = atoi(value);
-        } else if (option == "-z") {
-            wg_size = atoi(value);
-        } else if (option == "-x") {
-            shmem_context = atoi(value);
-        } else {
-            show_usage(argv[0]);
-            exit(-1);
-        }
-
-        i++;  // skip the value that was just consumed
-    }
-}
-
-/**
- * Prints the options accepted by the constructor above.
- */
-void
-TesterArguments::show_usage(std::string executable_name)
-{
-    std::cout << "Usage: " << executable_name << std::endl;
-    std::cout << "\t-w <number of workgroups>\n";
-    std::cout << "\t-s <minimum message size>\n";
-    std::cout << "\t-S <maximum message size>\n";
-    std::cout << "\t-a <algorithm: 0 = AlltoAll_Put, 1 = AlltoAll_Get>\n";
-    std::cout << "\t-z <workgroup size>\n";
-    std::cout << "\t-x <shmem context type>\n";
-}
-
-/**
- * Caches the PE count and this PE's id as reported by rocshmem.
- */
-void
-TesterArguments::get_rocshmem_arguments()
-{
-    numprocs = rocshmem_n_pes();
-    myid = rocshmem_my_pe();
-}
diff --git a/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.hpp b/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.hpp
deleted file mode 100644
index 175470df5b..0000000000
--- a/projects/rocshmem/internal/clients/shmem_rccl/tester_arguments.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *****************************************************************************/
-
-#ifndef _TESTER_ARGUMENTS_HPP_
-#define _TESTER_ARGUMENTS_HPP_
-
-#include <climits>
-
-#include <cstdint>
-#include <string>
-
-/**
- * Plain bundle of test settings: values parsed from the command line, values
- * queried from rocshmem, and fixed defaults.
- */
-class TesterArguments
-{
-  public:
-    TesterArguments(int argc, char *argv[]);
-
-    /**
-     * Initialize rocshmem members
-     * Valid after rocshmem_init function called.
-     */
-    void get_rocshmem_arguments();
-
-  private:
-    /**
-     * Output method which displays available command line options
-     */
-    static void show_usage(std::string executable_name);
-
-  public:
-    /**
-     * Arguments obtained from command line
-     */
-    unsigned num_wgs = 1;               // -w
-    unsigned algorithm = 0;             // -a
-    uint64_t min_msg_size = 1;          // -s
-    uint64_t max_msg_size = 1 << 20;    // -S
-    unsigned wg_size = 64;              // -z
-    unsigned shmem_context = 8;         // -x, ROCSHMEM_CTX_WG_PRIVATE
-
-    /**
-     * Arguments obtained from rocshmem
-     * (UINT_MAX until get_rocshmem_arguments() has been called)
-     */
-    unsigned numprocs = UINT_MAX;
-    unsigned myid = UINT_MAX;
-
-    /**
-     * Default tester values
-     */
-    int loop = 100;
-    int skip = 10;
-    int loop_large = 25;
-    int large_message_size = 32768;
-};
-
-#endif
diff --git a/projects/rocshmem/internal/clients/spts/CMakeLists.txt b/projects/rocshmem/internal/clients/spts/CMakeLists.txt
deleted file mode 100644
index 172c667776..0000000000
--- a/projects/rocshmem/internal/clients/spts/CMakeLists.txt
+++ /dev/null
@@ -1,144 +0,0 @@
-###############################################################################
-# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to
-# deal in the Software without restriction, including without limitation the
-# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-# sell copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-###############################################################################
-
-cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
-
-###############################################################################
-# GLOBAL COMPILE FLAGS
-###############################################################################
-# Root of the ROCm installation; used for the default compiler and for the
-# OpenCL headers and library.
-set(SPTS_ROCM_PATH "/opt/rocm" CACHE PATH "Root of the ROCm installation")
-
-set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-# Default to ROCm's hipcc, but never override a compiler the user selected
-# with -DCMAKE_CXX_COMPILER=... or through the CXX environment variable.
-if(NOT DEFINED CMAKE_CXX_COMPILER AND NOT DEFINED ENV{CXX})
-  set(CMAKE_CXX_COMPILER "${SPTS_ROCM_PATH}/bin/hipcc")
-endif()
-
-###############################################################################
-# DEFAULT BUILD TYPE
-###############################################################################
-# CMAKE_BUILD_TYPE is meaningless for multi-config generators, so only
-# default it for single-config ones and only when the user left it empty.
-get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
-if(NOT is_multi_config AND NOT CMAKE_BUILD_TYPE)
-  message(STATUS "CMAKE_BUILD_TYPE unspecified: generating Release build")
-
-  set(
-    CMAKE_BUILD_TYPE
-    "Release"
-    CACHE
-    STRING
-    "build type: Release, Debug, RelWithDebInfo, MinSizeRel"
-    FORCE
-  )
-endif()
-
-###############################################################################
-# PROJECT
-###############################################################################
-project(spts VERSION 1.1.0 LANGUAGES CXX)
-
-###############################################################################
-# CONFIGURATION OPTIONS
-###############################################################################
-option(USE_HIP "Build HIP version of the solver" OFF)
-option(USE_ROCSHMEM "Build rocSHMEM enabled version of the solver" OFF)
-option(ALL_ANALYZE "Build analyze and solve algorithm" OFF)
-option(USE_DOUBLE "Use double precision floats for the data" OFF)
-option(ALL_LEVELSET "Build levelset algorithm" OFF)
-option(ALL_LEVELSYNC "Build levelsync algorithm" OFF)
-option(ALL_SYNCFREE "Build syncfree algorithm" OFF)
-
-# config.h is generated into the build tree, which is added to the include
-# path below.
-configure_file(
-  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.h.in
-  ${CMAKE_CURRENT_BINARY_DIR}/config.h
-)
-
-###############################################################################
-# SOURCES
-###############################################################################
-add_executable(
-  ${PROJECT_NAME}
-  InputFlags.cpp
-  Main.cpp
-)
-
-target_include_directories(
-  ${PROJECT_NAME}
-  PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}
-    ${CMAKE_CURRENT_BINARY_DIR} # CONFIG.H
-)
-
-###############################################################################
-# HIP / HIP + rocSHMEM
-###############################################################################
-if(USE_HIP)
-  find_package(hip REQUIRED)
-
-  target_sources(
-    ${PROJECT_NAME}
-    PRIVATE
-      HIPHelper.cpp
-  )
-
-  if(USE_ROCSHMEM)
-    find_package(rocshmem CONFIG REQUIRED)
-
-    # Linking the imported targets is what propagates their include
-    # directories; a target name is not a valid include directory.
-    target_link_libraries(
-      ${PROJECT_NAME}
-      PRIVATE
-        rocshmem::rocshmem
-        hip::host
-    )
-
-    target_link_options(
-      ${PROJECT_NAME}
-      PRIVATE
-        -fgpu-rdc
-    )
-  endif()
-
-###############################################################################
-# OPENCL
-###############################################################################
-else()
-
-  if(USE_ROCSHMEM)
-    message(FATAL_ERROR "Cannot use rocSHMEM without USE_HIP")
-  endif()
-
-  target_sources(
-    ${PROJECT_NAME}
-    PRIVATE
-      OpenCLHelper.cpp
-  )
-
-  target_include_directories(
-    ${PROJECT_NAME}
-    PRIVATE
-      ${SPTS_ROCM_PATH}/opencl/include
-  )
-
-  target_link_directories(
-    ${PROJECT_NAME}
-    PRIVATE
-      ${SPTS_ROCM_PATH}/opencl/lib/x86_64
-  )
-
-  target_link_libraries(
-    ${PROJECT_NAME}
-    PRIVATE
-      OpenCL
-  )
-
-endif()
diff --git a/projects/rocshmem/internal/clients/spts/GPUHelper.h b/projects/rocshmem/internal/clients/spts/GPUHelper.h
deleted file mode 100644
index 7773726568..0000000000
--- a/projects/rocshmem/internal/clients/spts/GPUHelper.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- ********************************************************************************/ -#ifndef GPUHelper_H -#define GPUHelper_H - -#include "config.h" - -#include -#include -#include -#include "InputFlags.h" - -#define ROW_BITS 32 // May be not the right place to define this macro -#define WG_BITS 24 - -static int SPTS_BLOCK_SIZE = 0; - -#ifdef USE_ROCSHMEM -#define WF_PER_WG 1 -#else -#define WF_PER_WG 16 -#endif -#define WF_SIZE 64 - -#ifdef USE_HIP - #include - typedef void * memPointer; - typedef int memPointer_flags; - typedef int gpuInt; - typedef bool gpuBool; - typedef hipEvent_t gpuEvent; - typedef hipError_t gpuError; - #define GPU_MEM_READ_ONLY 0 - #define GPU_MEM_READ_WRITE 0 - #define GPU_MEM_USE_HOST_PTR 0 - #define GPU_TRUE true - #define GPU_FALSE false -#else -#include - typedef cl_mem memPointer; - typedef cl_mem_flags memPointer_flags; - typedef cl_int gpuInt; - typedef cl_bool gpuBool; - typedef cl_event gpuEvent; - typedef cl_int gpuError; - #define GPU_MEM_READ_ONLY CL_MEM_READ_ONLY - #define GPU_MEM_READ_WRITE CL_MEM_READ_ONLY - #define GPU_MEM_USE_HOST_PTR CL_MEM_USE_HOST_PTR - #define GPU_TRUE CL_TRUE - #define GPU_FALSE CL_FALSE -#endif - -class GPUHelper -{ - public: - GPUHelper() {} - virtual int Init(const std::string &_filename, InputFlags &in_flags) = 0; - virtual void checkStatus(gpuError status, const std::string errString) = 0; - virtual void CopyToDevice(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, gpuBool _blocking, gpuEvent *_ev) = 0; - virtual void CopyToHost(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, gpuBool _blocking, gpuEvent *_ev) = 0; - virtual memPointer AllocateMem(const std::string name, size_t, memPointer_flags flags, void *) = 0; - virtual void FreeMem(memPointer ptr) = 0; - virtual void Flush() = 0; -}; - -#endif //GPUHelper_H diff --git a/projects/rocshmem/internal/clients/spts/HIPHelper.cpp b/projects/rocshmem/internal/clients/spts/HIPHelper.cpp deleted file mode 100644 index 
57e94e655b..0000000000 --- a/projects/rocshmem/internal/clients/spts/HIPHelper.cpp +++ /dev/null @@ -1,99 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- ********************************************************************************/ - -#include "HIPHelper.h" -#include -#include -#include - -int HIPHelper::Init(const std::string &filename, InputFlags &in_flags) -{ - int device = 0; - hipSetDevice(device); - hipDeviceProp_t props; - hipGetDeviceProperties(&props, device /*deviceID*/); - printf("info: running on device %s\n", props.name); - printf("info: architecture on AMD GPU device is: %d\n", props.gcnArch); - - return 0; -} - -void HIPHelper::checkStatus(gpuError status, const std::string errString) -{ - if (status != HIP_SUCCESS) - { - std::cerr << errString << " : " << hipGetErrorString(status) << std::endl; - exit(-1); - } -} - -memPointer HIPHelper::AllocateMem(const std::string name, - size_t size, - memPointer_flags flags, - void *hostBuffer) -{ - void* buf; - std::string errString = "HIP error allocating " + name + " !"; - checkStatus(hipMalloc(&buf, size), errString); - printf("Allocating %s of size %zu at buf %p\n", name.c_str(), size, buf); - return buf; -} - -void HIPHelper::CopyToDevice(memPointer devBuffer, - void *hostBuffer, - size_t size, - size_t offset, - gpuBool blocking, - gpuEvent *ev) -{ - assert(offset == 0); - memcpy(devBuffer, hostBuffer, size); -/* - if (blocking == GPU_TRUE) { - checkStatus(hipMemcpy(devBuffer, hostBuffer, size, hipMemcpyHostToDevice), - "HIP error copying data to device !"); - } else { - checkStatus(hipMemcpyAsync(devBuffer, hostBuffer, size, hipMemcpyHostToDevice), - "HIP error copying data to device !"); - } -*/ -} - -void HIPHelper::CopyToHost(memPointer devBuffer, - void *hostBuffer, - size_t size, - size_t offset, - gpuBool blocking, - gpuEvent *ev) -{ - assert(offset == 0); -memcpy(hostBuffer, devBuffer, size); -/* - if (blocking == GPU_TRUE) { - checkStatus(hipMemcpy(hostBuffer, devBuffer, size, hipMemcpyDeviceToHost), - "HIP error copying data to device !"); - } else { - checkStatus(hipMemcpyAsync(hostBuffer, devBuffer, size, hipMemcpyDeviceToHost), - "HIP 
error copying data to device !"); - } -*/ -} diff --git a/projects/rocshmem/internal/clients/spts/HIPHelper.h b/projects/rocshmem/internal/clients/spts/HIPHelper.h deleted file mode 100644 index b7e1de1bd4..0000000000 --- a/projects/rocshmem/internal/clients/spts/HIPHelper.h +++ /dev/null @@ -1,50 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- ********************************************************************************/ -#ifndef CLHelper_H -#define CLHelper_H - -#define CL_USE_DEPRECATED_OPENCL_2_0_APIS -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#define CL_USE_DEPRECATED_OPENCL_1_1_APIS - -#include -#include -#include -#include "InputFlags.h" -#include "GPUHelper.h" -#include "hip/hip_runtime.h" - -class HIPHelper : public GPUHelper -{ - public: - HIPHelper() {} - int Init(const std::string &_filename, InputFlags &in_flags); - void checkStatus(gpuError status, const std::string errString); - void CopyToDevice(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, gpuBool _blocking, gpuEvent *_ev); - void CopyToHost(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, gpuBool _blocking, gpuEvent *_ev); - memPointer AllocateMem(const std::string name, size_t, memPointer_flags flags, void *); - void FreeMem(memPointer ptr) { hipFree(ptr); } - void Flush() { hipDeviceSynchronize(); } -}; - -#endif //CLHelper_H - diff --git a/projects/rocshmem/internal/clients/spts/InputFlags.cpp b/projects/rocshmem/internal/clients/spts/InputFlags.cpp deleted file mode 100644 index 262d58d15b..0000000000 --- a/projects/rocshmem/internal/clients/spts/InputFlags.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- ********************************************************************************/ -#include -#include -#include -#include "InputFlags.h" - -InputFlags::InputFlags() -{ - AddInputFlag("help", 'h', "", "Print Help Message", "string"); -} - -void InputFlags::AddInputFlag(const std::string &_long_name, - char _short_name, - const std::string &_value, - const std::string &_help_text, - const std::string &_type) -{ - Input in; - in.long_name = _long_name; - in.short_name = _short_name; - in.value = _value; - in.help_text = _help_text; - in.type = _type; - - if(MapInputs.count(_short_name) > 0) - printf("Input flag: %s (%c) already exists !", _long_name.c_str(), _short_name); - else - MapInputs[_short_name] = in; -} - -void InputFlags::Print() -{ - printf("SpTS Input Flags: \n\n"); - - for(auto &content : MapInputs) - std::cout< args; - for(int i = 1; i < argc; i++) - args.push_back(argv[i]); - - if(args.size() == 0) // No Input Flag - Print(); - - for(int i = 0; i < args.size(); i++) - { - std::string temp = args[i]; - if(temp[0] != '-') - { - printf("Illegal input flag\n"); - Print(); - } - else if(temp[0] == '-' && temp[1] == '-') // Long Name Input - { - std::string long_name = temp.substr(2); - if(long_name == "help") - Print(); - - char short_name = FindShortName(long_name); - - if (short_name == 'n' || short_name == 'z' || short_name == 'v') - { - MapInputs[short_name].value = "true"; - } - else - { - MapInputs[short_name].value = args[i+1]; - i++; - } - } - else if (temp[0] == '-' && temp[1] == '?') // Help Input - Print(); - else // Short Name Input - { - char short_name = temp[1]; - if(MapInputs.find(short_name) == MapInputs.end()) - { - std::cout<<"Input Flag: "< -#include - -struct Input -{ - std::string long_name; - char short_name; - std::string value; - std::string help_text; - std::string type; -}; - -class InputFlags -{ - std::map MapInputs; - - public: - InputFlags(); - virtual void AddDerivedInputFlags() = 0; - void AddInputFlag(const std::string 
&_long_name, - char _short_name, - const std::string &_value, - const std::string &_help_text, - const std::string &type); - void Parse(int argc, char *argv[]); - char FindShortName(const std::string &long_name); - void Print(); - - std::string GetValueStr(const std::string &long_name); - int GetValueInt(const std::string &long_name); - uint64_t GetValueUint64(const std::string &long_name); - float GetValueFloat(const std::string &long_name); - bool GetValueBool(const std::string &long_name); - - virtual ~InputFlags() {} -}; - -#endif //InputFlags_H diff --git a/projects/rocshmem/internal/clients/spts/LICENSE b/projects/rocshmem/internal/clients/spts/LICENSE deleted file mode 100644 index b1a3ae16d2..0000000000 --- a/projects/rocshmem/internal/clients/spts/LICENSE +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
diff --git a/projects/rocshmem/internal/clients/spts/Main.cpp b/projects/rocshmem/internal/clients/spts/Main.cpp deleted file mode 100644 index 429e1a242c..0000000000 --- a/projects/rocshmem/internal/clients/spts/Main.cpp +++ /dev/null @@ -1,193 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- ********************************************************************************/ - -#include "config.h" - -#ifdef USE_HIP -#include "hip/hip_runtime.h" -#else -#include "OpenCLHelper.h" -#endif - -#ifdef USE_RO_SHMEM -#include "mpi.h" -#endif - -#include "MatrixMarketReader.h" -#include "SpTS.h" -#include -#include -#include - -#ifdef USE_DOUBLE -typedef double FPTYPE; -#else -typedef float FPTYPE; -#endif - -using namespace rocshmem; - -int main(int argc, char *argv[]) -{ - SparseTriangularSolve spts_obj; - InputFlags &in_flags = spts_obj; - in_flags.AddDerivedInputFlags(); - in_flags.Parse(argc, argv); - FPTYPE alpha = in_flags.GetValueFloat("alpha"); - - printf("Reading input file: %s...", in_flags.GetValueStr("filename").c_str());fflush(stdout); - MatrixMarketReader mm_reader; - if (mm_reader.MMReadFormat(in_flags.GetValueStr("filename"), in_flags) != 0) - { - fprintf(stderr, "ERROR reading input file !\n"); - exit(1); - } - printf("Done.\n"); - - GPUHelper *GPU; -#ifdef USE_HIP - printf("Initializing HIP runtime...\n\t");fflush(stdout); - GPU = new HIPHelper(); - char buf[PATH_MAX + 1]; - readlink("/proc/self/exe", buf, sizeof(buf) - 1); - std::string str(buf); - printf("Going to try to open %s\n", (str.substr(0, str.rfind('/'))+"/spts_kernel.cl").c_str()); - if(GPU->Init((str.substr(0, str.rfind('/'))+ "/spts_kernel.cl").c_str(), in_flags) == 1) - { - fflush(stdout); - fprintf(stderr,"\nError Initializing HIP Runtime !\n"); - exit(-1); - } -#else - printf("Initializing OpenCL runtime...\n\t");fflush(stdout); - GPU = new CLHelper(); - char buf[PATH_MAX + 1]; - readlink("/proc/self/exe", buf, sizeof(buf) - 1); - std::string str(buf); - printf("Going to try to open %s\n", (str.substr(0, str.rfind('/'))+"/spts_kernel.cl").c_str()); - if(GPU->Init((str.substr(0, str.rfind('/'))+ "/spts_kernel.cl").c_str(), in_flags) == 1) - { - fflush(stdout); - fprintf(stderr,"\nError Initializing OpenCL Runtime !\n"); - exit(-1); - } -#endif - printf("Done.\n"); - - 
printf("Allocating sparse matrices...");fflush(stdout); - spts_obj.AllocateSparseMatrix(mm_reader, in_flags, GPU); - printf("Done.\n"); - - printf("Converting COO to CSR...");fflush(stdout); - spts_obj.ConvertFromCOOToCSR(mm_reader.GetCoordinates(), in_flags); - printf("Done.\n"); - - SPTS_BLOCK_SIZE = in_flags.GetValueInt("block_size"); - printf("Finding Stats For Parallel Decomposition...");fflush(stdout); - spts_obj.FindStatsForParallelDecomposition(); - printf("Done.\n"); - - printf("Allocating parallel sparse matrices...");fflush(stdout); - spts_obj.AllocateParallelSparseMatrix(mm_reader, in_flags); - printf("Done.\n"); - - printf("Allocating vectors...");fflush(stdout); - spts_obj.AllocateVectors(mm_reader); - printf("Done.\n"); - - float gflops = 0.f; - int errors = 0; - uint64_t ns_per_iter = 0; - uint64_t ns_per_analysis_iter = 0; - uint64_t ns_per_syncfree_iter = 0; - uint64_t ns_per_levelset_iter = 0; - uint64_t ns_per_levelsync_iter = 0; - - printf("Performing SpTS on the CPU with alpha=%f...", (float)alpha);fflush(stdout); - spts_obj.CSRSpTSCPU(alpha); - printf("Done.\n"); - - printf("Checking results of CPU-side SpTS...");fflush(stdout); - if (!spts_obj.CSRCheckCPU(alpha)) - { - fflush(stdout); - fprintf(stderr, "CPU-based results were 'wrong', likely due to FP rounding. Expect the CPU and GPU to differ wildly.\n"); - //exit(-1); - } - printf("Done.\n"); - - printf("Performing %d iterations of SpTS on the GPU with alpha=%f...", in_flags.GetValueInt("iterations"), (float)alpha);fflush(stdout); - gflops = spts_obj.CSRSpTSGPU(ns_per_iter, ns_per_analysis_iter, ns_per_syncfree_iter, ns_per_levelset_iter, ns_per_levelsync_iter, alpha); - printf("Done.\n"); - - if (in_flags.GetValueBool("verify")) { - printf("Checking whether GPU SpTS caused non-deterministic errors...\n");fflush(stdout); - int non_det_errors = spts_obj.NonDeterministicErrors(); - printf("Done.\n"); - if (non_det_errors) - fprintf(stderr, "ERROR!! 
-- Saw %d GPU iterations that had non-deterministic differences.\n", non_det_errors); - int max_errors = spts_obj.MaxErrors(); - if (max_errors) - { - if (max_errors > 1) - printf(" -- %d rows differed between CPU and GPU results.\n", max_errors); - else - printf(" -- %d row differed between CPU and GPU results.\n", max_errors); - } - else - printf("\n"); - } - - printf("File %s : SpTS Gflops: %f ms_per_iter: %lf ", in_flags.GetValueStr("filename").c_str(), gflops, ((double)ns_per_iter/1000000.)); - printf(" ( ms_per_analysis_iter: "); - if (ns_per_analysis_iter == 0) - printf("no_iter"); - else - printf("%lf", ((double)ns_per_analysis_iter/1000000.)); - printf(" | ms_per_syncfree_iter: "); - if (ns_per_syncfree_iter == 0) - printf("no_iter"); - else - printf("%lf", ((double)ns_per_syncfree_iter/1000000.)); - printf(" | ms_per_levelset_iter: "); - if (ns_per_levelset_iter == 0) - printf("no_iter"); - else - printf("%lf", ((double)ns_per_levelset_iter/1000000.)); - printf(" | ms_per_levelsync_iter: "); - if (ns_per_levelsync_iter == 0) - printf("no_iter )"); - else - printf("%lf )", ((double)ns_per_levelsync_iter/1000000.)); - -#ifdef USE_ROCSHMEM - MPI_Allreduce(MPI_IN_PLACE, (void *) &ns_per_analysis_iter, 1, - MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD); - - if (spts_obj.Get_this_pe() == 0) { - printf("\nRANK 0: analysis avg ms = %lf\n", - ((double) ns_per_analysis_iter / 1000000.) / spts_obj.Get_total_pes()); - } -#endif - - return 0; -} diff --git a/projects/rocshmem/internal/clients/spts/MatrixMarketReader.h b/projects/rocshmem/internal/clients/spts/MatrixMarketReader.h deleted file mode 100644 index 00403b64ac..0000000000 --- a/projects/rocshmem/internal/clients/spts/MatrixMarketReader.h +++ /dev/null @@ -1,377 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - ********************************************************************************/ -#ifndef MatrixMarketReader_H -#define MatrixMarketReader_H -/* -Portions of this file include code provided by The National Institute of -Standards and Technology (NIST). The code includes -macro definitions from mmio.h and is subject to the following disclaimer. - -Software Disclaimer - -NIST-developed software is provided by NIST as a public service. You may use, -copy and distribute copies of the software in any medium, provided that you -keep intact this entire notice. You may improve, modify and create derivative -works of the software or any portion of the software, and you may copy and -distribute such modifications or works. Modified works should carry a notice -stating that you changed the software and should note the date and nature of -any such change. 
Please explicitly acknowledge the National Institute of -Standards and Technology as the source of the software. - -NIST-developed software is expressly provided "AS IS" NIST MAKES NO WARRANTY -OF ANY KIND, EXPRESS, IMPLIED, IN FACT OR ARISING BY OPERATION OF LAW, -INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT AND DATA ACCURACY. NIST -NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE -UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES -NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR -THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, -RELIABILITY, OR USEFULNESS OF THE SOFTWARE. - -You are solely responsible for determining the appropriateness of using and -distributing the software and you assume all risks associated with its use, -including but not limited to the risks and costs of program errors, compliance -with applicable laws, damage to or loss of data, programs or equipment, and -the unavailability or interruption of operation. This software is not intended -to be used in any situation where a failure could cause risk of injury or -damage to property. The software developed by NIST employees is not subject -to copyright protection within the United States. 
-*/ - -#include -#include -#include -#include -#include -#include "InputFlags.h" -#include -#include "mmio.h" - -// Class declaration - -template -struct Coordinate { - int x; - int y; - FloatType val; -}; - -template -class MatrixMarketReader -{ - char Typecode[4]; - int nNZ; - int nRows; - int nCols; - int isSymmetric; - int isDoubleMem; - Coordinate *coords; - bool *has_seen_diag; - - public: - MatrixMarketReader() : nNZ(0), nRows(0), nCols(0), isSymmetric(0), isDoubleMem(0) - { - for (int i = 0; i < sizeof(Typecode); i++) - Typecode[i] = '\0'; - coords = NULL; - } - bool MMReadFormat(const std::string &_filename, InputFlags &_in_flags); - bool MMReadBanner(FILE *_infile); - bool MMReadMtxCrdSize(FILE *_infile); - void MMGenerateCOOFromFile(FILE *_infile, InputFlags &_in_flags); - - int GetNumRows() { return nRows; } - int GetNumCols() { return nCols; } - int GetNumNonZeroes() { return nNZ; } - int GetSymmetric() { return isSymmetric; } - - char *GetTypecode() { return Typecode; } - Coordinate *GetCoordinates() { return coords; } - - ~MatrixMarketReader() - { - delete[] coords; - } -}; - -// Class definition - -template -bool MatrixMarketReader::MMReadFormat(const std::string &filename, InputFlags &in_flags) -{ - FILE *mm_file = fopen(filename.c_str(), "r"); - if( mm_file == NULL) - { - printf("Cannot Open Matrix-Market File !\n"); - return 1; - } - - int status = MMReadBanner(mm_file); - if(status != 0) - { - printf("Error Reading Banner in Matrix-Market File !\n"); - return 1; - } - - if(! 
mm_is_coordinate(Typecode)) - {printf(" only handling coordinate format\n"); return(1);} - - if(mm_is_complex(Typecode)) { - printf("Error: cannot handle complex format\n"); - return (1); - } - - if(mm_is_symmetric(Typecode)) - isSymmetric = 1; - - status = MMReadMtxCrdSize(mm_file); - if(status != 0) { - printf("Error reading Matrix Market crd_size %d\n",status); - return(1); - } - - if(mm_is_symmetric(Typecode)) - coords = new Coordinate[nNZ+nRows]; - else if (in_flags.GetValueBool("non_symmetric")) - coords = new Coordinate[nNZ+nRows]; // This is too large, but oh well. - else - { - fprintf(stderr, "Error: Input matrix is NOT symmetric. This will not work for SpTS.\n"); - return (1); - } - - has_seen_diag = new bool[nRows]; - for (int i = 0; i < nRows; i++) - has_seen_diag[i] = false; - - MMGenerateCOOFromFile(mm_file, in_flags); - return 0; -} - -template -void FillCoordData(char Typecode[], - Coordinate *coords, - bool *has_seen_diag, - int &actual_nnz, - int ir, - int ic, - FloatType val) -{ - int new_x = ir - 1; - int new_y = ic - 1; - if (new_y > new_x) - { - // Skip stuff in the upper diagonal - // Just keep our lower diag. 
- return; - } - if (new_y == new_x) - has_seen_diag[new_x] = true; - coords[actual_nnz].x = new_x; - coords[actual_nnz].y = new_y; - coords[actual_nnz ++].val = val; -} - -template -void FixupMissingDiags(char Typecode[], - Coordinate *coords, - int &actual_nnz, - int nRows, - bool *has_seen_diag, - InputFlags &in_flags) -{ - for(int i = 0; i < nRows; i++) - { - if (has_seen_diag[i] == false) - { - coords[actual_nnz].x = i; - coords[actual_nnz].y = i; - coords[actual_nnz ++].val = 1.; - } - } -} - -template -void MatrixMarketReader::MMGenerateCOOFromFile(FILE *infile, - InputFlags &in_flags) -{ - int actual_nnz = 0; - FloatType val; - int ir, ic; - - int exp_zeroes = in_flags.GetValueBool("exp_zeroes"); - - for(int i = 0; i < nNZ; i++) - { - if(mm_is_real(Typecode)) - { - if(typeid(FloatType) == typeid(float)) - fscanf(infile, "%d %d %f\n", &ir, &ic, (float*)(&val)); - else if(typeid(FloatType) == typeid(double)) - fscanf(infile, "%d %d %lf\n", &ir, &ic, (double*)(&val)); - - if(exp_zeroes == 0 && val == 0) - continue; - else - FillCoordData(Typecode, coords, has_seen_diag, actual_nnz, ir, ic, val); - } - else if (mm_is_integer(Typecode)) - { - if(typeid(FloatType) == typeid(float)) - fscanf(infile, "%d %d %f\n", &ir, &ic, (float*)(&val)); - else if(typeid(FloatType) == typeid(double)) - fscanf(infile, "%d %d %lf\n", &ir, &ic, (double*)(&val)); - - if(exp_zeroes == 0 && val == 0) - continue; - else - FillCoordData(Typecode, coords, has_seen_diag, actual_nnz, ir, ic, val); - - } - else if(mm_is_pattern(Typecode)) - { - fscanf(infile, "%d %d", &ir, &ic); - //val = ((FloatType) MAX_RAND_VAL * (rand() / (RAND_MAX + 1.0))); - val = 3.; - - if(exp_zeroes == 0 && val == 0) - continue; - else - FillCoordData(Typecode, coords, has_seen_diag, actual_nnz, ir, ic, val); - } - } - FixupMissingDiags(Typecode, coords, actual_nnz, nRows, has_seen_diag, in_flags); - nNZ = actual_nnz; - printf("\n\tNNZ in the lower triangular and fixedup diagonal: %d\n", nNZ); -} - -template -bool 
MatrixMarketReader::MMReadBanner(FILE *infile) -{ - char line[MM_MAX_LINE_LENGTH]; - char banner[MM_MAX_TOKEN_LENGTH]; - char mtx[MM_MAX_TOKEN_LENGTH]; - char crd[MM_MAX_TOKEN_LENGTH]; - char data_type[MM_MAX_TOKEN_LENGTH]; - char storage_scheme[MM_MAX_TOKEN_LENGTH]; - char *p; - - mm_clear_typecode(Typecode); - - if (fgets(line, MM_MAX_LINE_LENGTH, infile) == NULL) - return MM_PREMATURE_EOF; - - if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type, - storage_scheme) != 5) - return MM_PREMATURE_EOF; - - for (p=mtx; *p!='\0'; *p=tolower(*p),p++); /* convert to lower case */ - for (p=crd; *p!='\0'; *p=tolower(*p),p++); - for (p=data_type; *p!='\0'; *p=tolower(*p),p++); - for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++); - - /* check for banner */ - if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0) - return MM_NO_HEADER; - - /* first field should be "mtx" */ - if (strcmp(mtx, MM_MTX_STR) != 0) - return MM_UNSUPPORTED_TYPE; - mm_set_matrix(Typecode); - - - /* second field describes whether this is a sparse matrix (in coordinate - storgae) or a dense array */ - - - if (strcmp(crd, MM_SPARSE_STR) == 0) - mm_set_sparse(Typecode); - else if (strcmp(crd, MM_DENSE_STR) == 0) - mm_set_dense(Typecode); - else - return MM_UNSUPPORTED_TYPE; - - - /* third field */ - - if (strcmp(data_type, MM_REAL_STR) == 0) - mm_set_real(Typecode); - else - if (strcmp(data_type, MM_COMPLEX_STR) == 0) - mm_set_complex(Typecode); - else - if (strcmp(data_type, MM_PATTERN_STR) == 0) - mm_set_pattern(Typecode); - else - if (strcmp(data_type, MM_INT_STR) == 0) - mm_set_integer(Typecode); - else - return MM_UNSUPPORTED_TYPE; - - - /* fourth field */ - - if (strcmp(storage_scheme, MM_GENERAL_STR) == 0) - mm_set_general(Typecode); - else - if (strcmp(storage_scheme, MM_SYMM_STR) == 0) - mm_set_symmetric(Typecode); - else - if (strcmp(storage_scheme, MM_HERM_STR) == 0) - mm_set_hermitian(Typecode); - else - if (strcmp(storage_scheme, MM_SKEW_STR) == 0) - 
mm_set_skew(Typecode); - else - return MM_UNSUPPORTED_TYPE; - - return 0; - -} - -template -bool MatrixMarketReader::MMReadMtxCrdSize(FILE *infile) -{ - char line[MM_MAX_LINE_LENGTH]; - int num_items_read; - - /* now continue scanning until you reach the end-of-comments */ - do - { - if (fgets(line,MM_MAX_LINE_LENGTH, infile) == NULL) - return MM_PREMATURE_EOF; - }while (line[0] == '%'); - - /* line[] is either blank or has M,N, nz */ - if (sscanf(line, "%d %d %d", &nRows, &nCols, &nNZ) == 3) - return 0; - else - do - { - num_items_read = fscanf(infile, "%d %d %d", &nRows, &nCols, &nNZ); - if (num_items_read == EOF) return MM_PREMATURE_EOF; - } - while (num_items_read != 3); - - return 0; -} -#endif // MatrixMarketReader_H diff --git a/projects/rocshmem/internal/clients/spts/OpenCLHelper.cpp b/projects/rocshmem/internal/clients/spts/OpenCLHelper.cpp deleted file mode 100644 index d4d45f269f..0000000000 --- a/projects/rocshmem/internal/clients/spts/OpenCLHelper.cpp +++ /dev/null @@ -1,486 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - ********************************************************************************/ -#include "OpenCLHelper.h" -#include -#include -#include - -cl_context CLHelper::context = NULL; -cl_command_queue CLHelper::commandQueue = NULL; -cl_kernel CLHelper::SpTSKernel = NULL; -cl_kernel CLHelper::SpTSKernel_analyze = NULL; -cl_kernel CLHelper::SpTSKernel_levelset = NULL; -cl_kernel CLHelper::SpTSKernel_scalar = NULL; -cl_kernel CLHelper::SpTSKernel_vector = NULL; -cl_kernel CLHelper::SpTSKernel_levelsync = NULL; - -const char * get_cl_err_string(cl_int err) -{ - switch (err) - { - case CL_SUCCESS: - return "CL_SUCCESS"; - case CL_DEVICE_NOT_FOUND: - return "CL_DEVICE_NOT_FOUND"; - case CL_DEVICE_NOT_AVAILABLE: - return "CL_DEVICE_NOT_AVAILABLE"; - case CL_COMPILER_NOT_AVAILABLE: - return "CL_COMPILER_NOT_AVAILABLE"; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; - case CL_OUT_OF_RESOURCES: - return "CL_OUT_OF_RESOURCES"; - case CL_OUT_OF_HOST_MEMORY: - return "CL_OUT_OF_HOST_MEMORY"; - case CL_PROFILING_INFO_NOT_AVAILABLE: - return "CL_PROFILING_INFO_NOT_AVAILABLE"; - case CL_MEM_COPY_OVERLAP: - return "CL_MEM_COPY_OVERLAP"; - case CL_IMAGE_FORMAT_MISMATCH: - return "CL_IMAGE_FORMAT_MISMATCH"; - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; - case CL_BUILD_PROGRAM_FAILURE: - return "CL_BUILD_PROGRAM_FAILURE"; - case CL_MAP_FAILURE: - return "CL_MAP_FAILURE"; -#ifdef CL_VERSION_1_1 - case CL_MISALIGNED_SUB_BUFFER_OFFSET: - return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; - case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: - return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; -#endif -#ifdef CL_VERSION_1_2 - case 
CL_COMPILE_PROGRAM_FAILURE: - return "CL_COMPILE_PROGRAM_FAILURE"; - case CL_LINKER_NOT_AVAILABLE: - return "CL_LINKER_NOT_AVAILABLE"; - case CL_LINK_PROGRAM_FAILURE: - return "CL_LINK_PROGRAM_FAILURE"; - case CL_DEVICE_PARTITION_FAILED: - return "CL_DEVICE_PARTITION_FAILED"; - case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: - return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; -#endif - case CL_INVALID_VALUE: - return "CL_INVALID_VALUE"; - case CL_INVALID_DEVICE_TYPE: - return "CL_INVALID_DEVICE_TYPE"; - case CL_INVALID_PLATFORM: - return "CL_INVALID_PLATFORM"; - case CL_INVALID_DEVICE: - return "CL_INVALID_DEVICE"; - case CL_INVALID_CONTEXT: - return "CL_INVALID_CONTEXT"; - case CL_INVALID_QUEUE_PROPERTIES: - return "CL_INVALID_QUEUE_PROPERTIES"; - case CL_INVALID_COMMAND_QUEUE: - return "CL_INVALID_COMMAND_QUEUE"; - case CL_INVALID_HOST_PTR: - return "CL_INVALID_HOST_PTR"; - case CL_INVALID_MEM_OBJECT: - return "CL_INVALID_MEM_OBJECT"; - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case CL_INVALID_IMAGE_SIZE: - return "CL_INVALID_IMAGE_SIZE"; - case CL_INVALID_SAMPLER: - return "CL_INVALID_SAMPLER"; - case CL_INVALID_BINARY: - return "CL_INVALID_BINARY"; - case CL_INVALID_BUILD_OPTIONS: - return "CL_INVALID_BUILD_OPTIONS"; - case CL_INVALID_PROGRAM: - return "CL_INVALID_PROGRAM"; - case CL_INVALID_PROGRAM_EXECUTABLE: - return "CL_INVALID_PROGRAM_EXECUTABLE"; - case CL_INVALID_KERNEL_NAME: - return "CL_INVALID_KERNEL_NAME"; - case CL_INVALID_KERNEL_DEFINITION: - return "CL_INVALID_KERNEL_DEFINITION"; - case CL_INVALID_KERNEL: - return "CL_INVALID_KERNEL"; - case CL_INVALID_ARG_INDEX: - return "CL_INVALID_ARG_INDEX"; - case CL_INVALID_ARG_VALUE: - return "CL_INVALID_ARG_VALUE"; - case CL_INVALID_ARG_SIZE: - return "CL_INVALID_ARG_SIZE"; - case CL_INVALID_KERNEL_ARGS: - return "CL_INVALID_KERNEL_ARGS"; - case CL_INVALID_WORK_DIMENSION: - return "CL_INVALID_WORK_DIMENSION"; - case CL_INVALID_WORK_GROUP_SIZE: - return 
"CL_INVALID_WORK_GROUP_SIZE"; - case CL_INVALID_WORK_ITEM_SIZE: - return "CL_INVALID_WORK_ITEM_SIZE"; - case CL_INVALID_GLOBAL_OFFSET: - return "CL_INVALID_GLOBAL_OFFSET"; - case CL_INVALID_EVENT_WAIT_LIST: - return "CL_INVALID_EVENT_WAIT_LIST"; - case CL_INVALID_EVENT: - return "CL_INVALID_EVENT"; - case CL_INVALID_OPERATION: - return "CL_INVALID_OPERATION"; - case CL_INVALID_GL_OBJECT: - return "CL_INVALID_GL_OBJECT"; - case CL_INVALID_BUFFER_SIZE: - return "CL_INVALID_BUFFER_SIZE"; -#ifdef CL_VERSION_1_1 - case CL_INVALID_MIP_LEVEL: - return "CL_INVALID_MIP_LEVEL"; - case CL_INVALID_GLOBAL_WORK_SIZE: - return "CL_INVALID_GLOBAL_WORK_SIZE"; - case CL_INVALID_PROPERTY: - return "CL_INVALID_PROPERTY"; -#ifdef cl_ext_device_fission - case CL_DEVICE_PARTITION_FAILED_EXT: - return "CL_DEVICE_PARTITION_FAILED_EXT"; - case CL_INVALID_PARTITION_COUNT_EXT: - return "CL_INVALID_PARTITION_COUNT_EXT"; - case CL_INVALID_PARTITION_NAME_EXT: - return "CL_INVALID_PARTITION_NAME_EXT"; -#endif -#endif -#ifdef CL_VERSION_1_2 - case CL_INVALID_IMAGE_DESCRIPTOR: - return "CL_INVALID_IMAGE_DESCRIPTOR"; - case CL_INVALID_COMPILER_OPTIONS: - return "CL_INVALID_COMPILER_OPTIONS"; - case CL_INVALID_LINKER_OPTIONS: - return "CL_INVALID_LINKER_OPTIONS"; - case CL_INVALID_DEVICE_PARTITION_COUNT: - return "CL_INVALID_DEVICE_PARTITION_COUNT"; -#endif -#ifdef CL_VERSION_2_0 - case CL_INVALID_PIPE_SIZE: - return "CL_INVALID_PIPE_SIZE"; - case CL_INVALID_DEVICE_QUEUE: - return "CL_INVALID_DEVICE_QUEUE"; -#endif -#ifdef CL_VERSION_2_2 - case CL_INVALID_SPEC_ID: - return "CL_INVALID_SPEC_ID"; - case CL_MAX_SIZE_RESTRICTION_EXCEEDED: - return "CL_MAX_SIZE_RESTRICTION_EXCEEDED"; -#endif -#ifdef cl_khr_icd - case CL_PLATFORM_NOT_FOUND_KHR: - return "CL_PLATFORM_NOT_FOUND_KHR"; -#endif - default: - return "UNKNOWN CL ERROR CODE"; - } -} - -void convertToStr(char **source, size_t* sourceSize, const std::string fname) -{ - FILE *fp = fopen(fname.c_str(), "r"); - fseek(fp, 0, SEEK_END); - *sourceSize = 
ftell(fp); - fseek(fp , 0, SEEK_SET); - *source = (char *)malloc(*sourceSize * sizeof(char)); - fread(*source, 1, *sourceSize, fp); - fclose(fp); - -} - -int CLHelper::Init(const std::string &filename, InputFlags &in_flags) -{ - cl_int status = 0; - size_t deviceListSize; - unsigned int i; - - /* - * Have a look at the available platforms and pick either - * the AMD one if available or a reasonable default. - */ - cl_uint numPlatforms; - platform = NULL; - status = clGetPlatformIDs(0, NULL, &numPlatforms); - if(status != CL_SUCCESS) - { - fprintf(stderr,"clGetPlatformIDs failed. %u",numPlatforms); - return 1; - } - if (0 < numPlatforms) - { - cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id)); - status = clGetPlatformIDs(numPlatforms, platforms, NULL); - if(status != CL_SUCCESS) - { - fprintf(stderr, "clGetPlatformIDs failed: %s\n", get_cl_err_string(status) ); - return 1; - } - for (i = 0; i < numPlatforms; ++i) - { - char pbuf[100]; - status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); - - if(status != CL_SUCCESS) - { - fprintf(stderr,"clGetPlatformInfo failed: %s\n", get_cl_err_string(status)); - return 1; - } - - platform = platforms[i]; - if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) - { - break; - } - } - free(platforms); - } - - ///////////////////////////////////////////////////////////////// - // Create an OpenCL context - ///////////////////////////////////////////////////////////////// - - cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; - cl_context_properties* cprops = (NULL == platform) ? NULL : cps; - context = clCreateContextFromType(cprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); - if(status != CL_SUCCESS) - { - printf("status: %d", status); - fprintf(stderr,"Error: Creating Context. 
(clCreateContextFromType): %s\n", get_cl_err_string(status)); - return 1; - } - /* First, get the size of device list data */ - status = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(size_t), &deviceListSize, NULL); - if(status != CL_SUCCESS) - { - fprintf(stderr,"Error: Getting Context Info (device list size, clGetContextInfo): %s\n", get_cl_err_string(status)); - return 1; - } - - ///////////////////////////////////////////////////////////////// - // Detect OpenCL devices - ///////////////////////////////////////////////////////////////// - devices = (cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id)); - if(devices == 0) - { - fprintf(stderr,"Error: No devices found: %s\n", get_cl_err_string(status)); - return 1; - } - - /* Now, get the device list data */ - status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize*sizeof(cl_device_id), devices, NULL); - if(status != CL_SUCCESS) - { - fprintf(stderr,"Error: Getting Context Info (device list, clGetContextInfo): %s\n", get_cl_err_string(status)); - return 1; - } - - char *deviceName; - size_t dev_name_size = 0; - - int deviceNum = in_flags.GetValueInt("device"); - - clGetDeviceInfo(devices[deviceNum], CL_DEVICE_NAME, sizeof(char*), NULL, &dev_name_size); - deviceName = (char *)malloc(sizeof(char)*dev_name_size); - - clGetDeviceInfo(devices[deviceNum], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); - printf("Device Name: %s\n", deviceName); - - bool use_gcn3 = false; - bool use_gcn2 = false; - char *found_gfx8 = strstr(deviceName, "gfx8"); - char *found_gfx7 = strstr(deviceName, "gfx7"); - if (found_gfx8 != NULL) - use_gcn3 = true; - if (found_gfx7 != NULL) - use_gcn2 = true; - - free(deviceName); - - ///////////////////////////////////////////////////////////////// - // Create an OpenCL command queue - ///////////////////////////////////////////////////////////////// - commandQueue = clCreateCommandQueue(context, devices[deviceNum], CL_QUEUE_PROFILING_ENABLE, &status); - 
if(status != CL_SUCCESS) - { - fprintf(stderr,"Creating Command Queue. (clCreateCommandQueue): %s\n", get_cl_err_string(status)); - return 1; - } - - ///////////////////////////////////////////////////////////////// - // Load CL file, build CL program object, create CL kernel object - ///////////////////////////////////////////////////////////////// - char* source; - size_t sourceSize; - convertToStr(&source, &sourceSize, filename); - - syncfree_program = clCreateProgramWithSource(context, 1, (const char**)&source, &sourceSize, &status); - if(status != CL_SUCCESS) - { - fprintf(stderr,"Error: Loading Binary into cl_program (clCreateProgramWithBinary): %s\n", get_cl_err_string(status)); - return 1; - } - analyze_levelset_program = clCreateProgramWithSource(context, 1, (const char**)&source, &sourceSize, &status); - if(status != CL_SUCCESS) - { - fprintf(stderr,"Error: Loading Binary into cl_program (clCreateProgramWithBinary): %s\n", get_cl_err_string(status)); - return 1; - } - - std::string buildFlags = "-x clc++ -Dcl_khr_int64_base_atomics=1 -cl-std=CL2.0"; - if (use_gcn3) - buildFlags += " -DGCN3 "; - if (use_gcn2) - buildFlags += " -DGCN2 "; - buildFlags += " -DROW_BITS=" + std::to_string(ROW_BITS); - buildFlags += " -DWG_BITS=" + std::to_string(WG_BITS); - buildFlags += " -DWF_SIZE=" + std::to_string(WF_SIZE); - buildFlags += " -DWF_PER_WG=" + std::to_string(WF_PER_WG); -#ifdef USE_DOUBLE - buildFlags += " -DDOUBLE"; -#endif - - /* create a cl program executable for all the devices specified */ - status = clBuildProgram(analyze_levelset_program, 1, &devices[deviceNum], buildFlags.c_str(), NULL, NULL); - if(status != CL_SUCCESS) - { - printf("Error: Building Analyze and Levelset Program (clBuildProgram): %d\n", status); - char * errorbuf = (char*)calloc(sizeof(char),1024*1024); - size_t size; - clGetProgramBuildInfo(analyze_levelset_program, devices[deviceNum], CL_PROGRAM_BUILD_LOG, 1024*1024, errorbuf, &size); - printf("%s ", errorbuf); - return 1; - } - - 
buildFlags += " -DSYNCFREE_KERNEL"; - status = clBuildProgram(syncfree_program, 1, &devices[deviceNum], buildFlags.c_str(), NULL, NULL); - if(status != CL_SUCCESS) - { - printf("Error: Building Syncfree Program (clBuildProgram): %d\n", status); - char * errorbuf = (char*)calloc(sizeof(char),1024*1024); - size_t size; - clGetProgramBuildInfo(syncfree_program, devices[deviceNum], CL_PROGRAM_BUILD_LOG, 1024*1024, errorbuf, &size); - printf("%s ", errorbuf); - return 1; - } - - SpTSKernel = clCreateKernel(syncfree_program, "amd_spts_syncfree_solve", &status); - if(status != CL_SUCCESS) - { - fprintf(stderr,"Error: Creating Kernel from program. (SpTS): %s\n", get_cl_err_string(status)); - return 1; - } - - SpTSKernel_analyze = clCreateKernel(analyze_levelset_program, "amd_spts_analyze_and_solve", &status); - if(status != CL_SUCCESS) - { - fprintf(stderr,"Error: Creating Kernel from program. (SpTS_analyze): %s\n", get_cl_err_string(status)); - return 1; - } - - SpTSKernel_levelset = clCreateKernel(analyze_levelset_program, "amd_spts_levelset_solve", &status); - if(status != CL_SUCCESS) - { - fprintf(stderr,"Error: Creating Kernel from program. (SpTS_levelset): %s\n", get_cl_err_string(status)); - return 1; - } - - SpTSKernel_scalar = clCreateKernel(analyze_levelset_program, "amd_spts_scalar_solve", &status); - if(status != CL_SUCCESS) - { - fprintf(stderr,"Error: Creating Kernel from program. (SpTS_scalar): %s\n", get_cl_err_string(status)); - return 1; - } - - SpTSKernel_vector = clCreateKernel(analyze_levelset_program, "amd_spts_vector_solve", &status); - if(status != CL_SUCCESS) - { - fprintf(stderr,"Error: Creating Kernel from program. (SpTS_vector): %s\n", get_cl_err_string(status)); - return 1; - } - - SpTSKernel_levelsync = clCreateKernel(analyze_levelset_program, "amd_spts_levelsync_solve", &status); - if(status != CL_SUCCESS) - { - fprintf(stderr,"Error: Creating Kernel from program. 
(SpTS_levelsync): %s\n", get_cl_err_string(status)); - return 1; - } - - // All good - return 0; -} - -void CLHelper::checkStatus(cl_int status, const std::string errString) -{ - if (status != CL_SUCCESS) - { - std::cerr << errString << " : " << get_cl_err_string(status) << std::endl; - exit(-1); - } -} - -memPointer CLHelper::AllocateMem(const std::string name, - size_t size, - memPointer_flags flags, - void *hostBuffer) -{ - cl_mem buf; - cl_int status; - - buf = clCreateBuffer(context, flags, size, hostBuffer, &status); - std::string errString = "OpenCL error allocating " + name + " !"; - checkStatus(status, errString); - - return buf; -} - -void CLHelper::CopyToDevice(memPointer devBuffer, - void *hostBuffer, - size_t size, - size_t offset, - cl_bool blocking, - cl_event *ev) -{ - cl_int status; - status = clEnqueueWriteBuffer(commandQueue, devBuffer, blocking, offset, size, hostBuffer, 0, NULL, ev); - - checkStatus(status, "OpenCL error copying data to device !"); -} - -void CLHelper::CopyToHost(memPointer devBuffer, - void *hostBuffer, - size_t size, - size_t offset, - cl_bool blocking, - cl_event *ev) -{ - cl_int status; - status = clEnqueueReadBuffer(commandQueue, devBuffer, blocking, offset, size, hostBuffer, 0, NULL, ev); - - checkStatus(status, "OpenCL error copying data to device !"); -} - -int64_t CLHelper::ComputeTime(cl_event event) -{ - int64_t start_time, end_time; - - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(int64_t), &start_time, NULL); - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(int64_t), &end_time, NULL); - - return end_time - start_time; -} diff --git a/projects/rocshmem/internal/clients/spts/OpenCLHelper.h b/projects/rocshmem/internal/clients/spts/OpenCLHelper.h deleted file mode 100644 index 49a8b83646..0000000000 --- a/projects/rocshmem/internal/clients/spts/OpenCLHelper.h +++ /dev/null @@ -1,108 +0,0 @@ -/******************************************************************************** - * 
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- ********************************************************************************/ -#ifndef CLHelper_H -#define CLHelper_H - -#define CL_USE_DEPRECATED_OPENCL_2_0_APIS -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#define CL_USE_DEPRECATED_OPENCL_1_1_APIS - -#include -#include -#include -#include -#include "InputFlags.h" -#include "GPUHelper.h" - -struct LocalMemArg -{ - LocalMemArg(size_t _size) : size(_size) {} - size_t GetSize() const { return size; } - - private: - size_t size; -}; - -class CLHelper : public GPUHelper -{ - cl_platform_id platform; - cl_device_id *devices; - cl_program syncfree_program; - cl_program analyze_levelset_program; - - public: - static cl_context context; - static cl_kernel SpTSKernel; - static cl_kernel SpTSKernel_analyze; - static cl_kernel SpTSKernel_levelset; - static cl_kernel SpTSKernel_scalar; - static cl_kernel SpTSKernel_vector; - static cl_kernel SpTSKernel_levelsync; - static cl_command_queue commandQueue; - - CLHelper() {} - int Init(const std::string &_filename, InputFlags &in_flags); - void checkStatus(gpuError status, const std::string errString); - void CopyToDevice(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, cl_bool _blocking, cl_event *_ev); - void CopyToHost(memPointer _d_buf, void *_h_buf, size_t _size, size_t _offset, cl_bool _blocking, cl_event *_ev); - memPointer AllocateMem(const std::string name, size_t, memPointer_flags flags, void *); - void FreeMem(memPointer ptr) { clReleaseMemObject(ptr); } - void Flush() { clFinish(commandQueue); } - - template - void SetArgs(cl_kernel, int i, const T& first, const Args&... rest); - template - void SetArgs(cl_kernel, int i, const LocalMemArg &lmem, const Args&... rest); - void SetArgs(cl_kernel, int i) {} - - int64_t ComputeTime(cl_event event); - -}; - -template -void CLHelper::SetArgs(cl_kernel kernel, int i, const T& first, const Args&... 
rest) -{ - cl_int status; - - status = clSetKernelArg(kernel, i++, sizeof(T), (void *)& first); - std::stringstream errStream; - errStream<<"OpenCL error setting kernel argument "< -void CLHelper::SetArgs(cl_kernel kernel, int i, const LocalMemArg &lmem, const Args&... rest) -{ - cl_int status; - status = clSetKernelArg(kernel, i++, lmem.GetSize(), NULL); - std::stringstream errStream; - errStream<<"OpenCL error setting kernel argument (local memory) "< -#include -#include -#include -#include -#include - -#include -#include - -#include - - -#ifdef USE_ROCSHMEM -#include "rocshmem.hpp" -#include "mpi.h" -#endif - -#ifdef USE_HIP -#include "spts_kernel.h" -#endif - -#ifdef DBL_DECIMAL_DIG - #define OP_DBL_Digs (DBL_DECIMAL_DIG) -#else - #ifdef DECIMAL_DIG - #define OP_DBL_Digs (DECIMAL_DIG) - #else - #define OP_DBL_Digs (DBL_DIG + 3) - #endif -#endif - -#ifdef FLT_DECIMAL_DIG - #define OP_FLT_Digs (FLT_DECIMAL_DIG) -#else - #ifdef DECIMAL_DIG - #define OP_FLT_Digs (DECIMAL_DIG) - #else - #define OP_FLT_Digs (FLT_DIG + 3) - #endif -#endif - -using namespace rocshmem; - -template -class SparseTriangularSolve : - public InputFlags, public SparseMatrix -{ - FloatType *x; - FloatType *y; - FloatType *y_zero; - FloatType *yref; - std::vector rowBlocks; - - memPointer xDev; - memPointer yDev; - memPointer completedRowsDev; - memPointer rowBlocksDev; - memPointer doneArrayDev; - memPointer shadowDoneArrayDev; - memPointer remoteInProgressArrayDev; - memPointer reqUpdateArrayDev; - memPointer numRowsAtLevelDev; - memPointer maxDepthDev; - memPointer rowMapDev; - memPointer totalSpinDev; - memPointer oneBufDev; - - int nNZ; - int nRows; - int nCols; - int numBlocks; -/* - #ifdef USE_ROCSHMEM - rocshmem_t* handle; - #endif -*/ - std::unordered_map *observed_errors; - int *errors_seen; - - public: - - SparseTriangularSolve() : nNZ(0), nRows(0), nCols(0), numBlocks(0) - { - x = NULL; y = NULL; y_zero = NULL, yref = NULL, observed_errors = NULL, errors_seen = NULL; - xDev = yDev = 
completedRowsDev = remoteInProgressArrayDev = rowBlocksDev = doneArrayDev = shadowDoneArrayDev = numRowsAtLevelDev = maxDepthDev = rowMapDev = totalSpinDev = oneBufDev = 0; - - #ifdef USE_ROCSHMEM - int rocshmem_queues = (2560 / WF_PER_WG); - if (2560 % WF_PER_WG) - rocshmem_queues++; - printf("rocshmem_queues %d WF_PER_WG %d \n",rocshmem_queues, WF_PER_WG); - rocshmem_init(rocshmem_queues); - - this->Set_total_pes(rocshmem_n_pes()); - this->Set_this_pe(rocshmem_my_pe()); - #else - this->Set_total_pes(1); - this->Set_this_pe(0); - #endif - } - - void AddDerivedInputFlags(); - void AllocateVectors(MatrixMarketReader &mm_reader); - void CSRSpTSCPU(FloatType alpha); - bool CSRCheckCPU(FloatType alpha); - - float CSRSpTSGPU(uint64_t &ns_per_iter, uint64_t &ns_per_analysis_iter, uint64_t &ns_per_syncfree_iter, uint64_t &ns_per_levelset_iter, uint64_t &ns_per_levelsync_iter, FloatType alpha); - - int VerifyResults(int); - int NonDeterministicErrors(); - int MaxErrors(); - int ComputeRowBlocks(std::vector &, int *, int); - - ~SparseTriangularSolve() - { - if (x != NULL) - delete[] x; - if (y != NULL) - delete[] y; - if (y_zero != NULL) - delete[] y_zero; - if (yref != NULL) - delete[] yref; - if (errors_seen != NULL) - delete[] errors_seen; - - if (xDev != 0) - this->GPU->FreeMem(xDev); - if (rowBlocksDev != 0) - this->GPU->FreeMem(rowBlocksDev); - if (completedRowsDev != 0) - this->GPU->FreeMem(completedRowsDev); - if (numRowsAtLevelDev != 0) - this->GPU->FreeMem(numRowsAtLevelDev); - if (maxDepthDev != 0) - this->GPU->FreeMem(maxDepthDev); - if (rowMapDev != 0) - this->GPU->FreeMem(rowMapDev); - if (totalSpinDev != 0) - this->GPU->FreeMem(totalSpinDev); - if (oneBufDev != 0) - this->GPU->FreeMem(oneBufDev); - if (remoteInProgressArrayDev != 0) - this->GPU->FreeMem(remoteInProgressArrayDev); - - #ifndef USE_ROCSHMEM - if (yDev != 0) - this->GPU->FreeMem(yDev); - if (doneArrayDev != 0) - this->GPU->FreeMem(doneArrayDev); - if (reqUpdateArrayDev != 0) - 
this->GPU->FreeMem(reqUpdateArrayDev); - if (shadowDoneArrayDev != 0) - this->GPU->FreeMem(shadowDoneArrayDev); - #else - if (yDev != 0) - rocshmem_free(yDev); - if (doneArrayDev != 0) - rocshmem_free(doneArrayDev); - if (reqUpdateArrayDev != 0) - rocshmem_free(reqUpdateArrayDev); - if (shadowDoneArrayDev != 0) - rocshmem_free(shadowDoneArrayDev); - rocshmem_finalize(); - #endif - } -}; - - template -void SparseTriangularSolve::AddDerivedInputFlags() -{ - AddInputFlag("filename", 'f', "", "Matrix-Market File", "string"); - AddInputFlag("iterations", 'i', "10", "Number of SpTS Iterations (Default=10)", "int"); - AddInputFlag("exp_zeroes", 'z', "false", "Include Explicit Zeroes in Matrix-Market File (Default=false)", "bool"); - AddInputFlag("device", 'd', "0", "Choose the GPU to Execute SpTS (Default=0)", "int"); - AddInputFlag("alpha", 'A', "1.0", "A*y=alpha*x. Known vector 'x' is multiplied by scalar alpha befoer solving for vector 'y'. (Default=1.0)", "float"); - AddInputFlag("non_symmetric", 'n', "false", "Force the program to work on non-symmetric matrices. This will ignore the upper triangular entirely. 
(Default=false)", "bool"); - AddInputFlag("levelsync_size", 'l', "0", "Number of rows to launch in a level-sync kernel invocation (Default = auto-tune)", "int"); - AddInputFlag("verify", 'v', "false", "Verify results", "bool"); - AddInputFlag("rocshmem_algorithm", 'a', "0", "rocSHMEM algorithm type", "int"); - AddInputFlag("block_size", 'b', "32768", "Use get-based algorithm for rocSHMEM", "int"); - AddInputFlag("put_block_size", 'p', "1024", "Block size for puts", "int"); - AddInputFlag("get_backoff_factor", 'g', "128", "Backoff factor for gets", "int"); -} - - template -void SparseTriangularSolve::AllocateVectors( - MatrixMarketReader &mm_reader) -{ - nRows = mm_reader.GetNumRows(); - nCols = mm_reader.GetNumCols(); - nNZ = mm_reader.GetNumNonZeroes(); - - x = new FloatType[nCols]; - y = new FloatType[nRows]; - y_zero = new FloatType[nRows]; - yref = new FloatType[nRows]; - observed_errors = new std::unordered_map[InputFlags::GetValueInt("iterations")]; - - for(int i = 0; i < nRows; i++) - { - y[i] = (FloatType)0.0; - y_zero[i] = (FloatType)0.0; - yref[i] = (FloatType)0.0; - } - - for(int i = 0; i < nCols; i++) - { - //x[i] = (FloatType)rand() / (FloatType)RAND_MAX; - x[i] = 2.; - } - - xDev = this->GPU->AllocateMem("xDev", nCols*sizeof(FloatType), GPU_MEM_READ_ONLY, NULL); - #ifndef USE_ROCSHMEM - yDev = this->GPU->AllocateMem("yDev", nRows*sizeof(FloatType), GPU_MEM_READ_WRITE, NULL); - #else - yDev = (memPointer) rocshmem_malloc(nRows*sizeof(FloatType)); - #endif -} - - template -void SparseTriangularSolve::CSRSpTSCPU(FloatType alpha) -{ - FloatType *NZvalues = SparseMatrix::GetVals(); - int *Cols = SparseMatrix::GetCols(); - int *rowptrs = SparseMatrix::GetRowPtrs(); - double internal_alpha = alpha; - - uint64_t local_nnz = 0; - uint64_t remote_nnz = 0; - uint64_t rows_with_nonlocal = 0; - - for(int i = 0; i < nRows; i++) - { - bool row_has_nonlocal = false; - double diagonal = 0.; - double temp = 0.; - int diag_j = -1; - for(int j = rowptrs[i]; j < 
rowptrs[i+1]; j++) - { - int ci = Cols[j]; - int row_pe = (i / SPTS_BLOCK_SIZE) % this->Get_total_pes(); - int col_pe = (ci / SPTS_BLOCK_SIZE) % this->Get_total_pes(); - - int assigned_pe = (i / SPTS_BLOCK_SIZE) % this->Get_total_pes(); - if (assigned_pe == this->Get_this_pe()) { - if (row_pe == col_pe) { - local_nnz++; - } else { - row_has_nonlocal = true; - remote_nnz++; - } - } - - // Skip adding in the diagonal. We need to solve for that. - if (ci != i) - { - if (i == TEST_NUM) - fprintf(stderr, "NZvalues[%d](%lf) * yref[%d](%lf)\n", j, NZvalues[j], ci, yref[ci]); - temp += NZvalues[j] * yref[ci]; - } - else - { - if (i==TEST_NUM) - fprintf(stderr, "\t\tDIAG = %lf\n", NZvalues[j]); - diagonal = NZvalues[j]; - diag_j = j; - } - } - if (row_has_nonlocal) rows_with_nonlocal++; - if (diag_j == -1) - { - fflush(stdout); - printf("\nERROR in SpTS CPU\n"); - printf("No diagonal found in row %d\n", i); - } - // y = (x-sum_of_vals_from_A) / diag - double alpha_x = internal_alpha * (double)x[i]; - if (i == TEST_NUM) - { - char buf[128]; - char buf2[128]; - char buf3[128]; - snprintf(buf, sizeof(buf), "%.20f", alpha_x); - snprintf(buf2, sizeof(buf2), "%.20f", internal_alpha); - fprintf(stderr, "alpha_x: %s (%s * %lf)\n", buf, buf2, x[i]); - snprintf(buf3, sizeof(buf3), "%.20f", temp); - fprintf(stderr, "temp: %s\n", buf3); - } - yref[i] = (FloatType)((alpha_x - temp)/diagonal); - if (i == TEST_NUM) - fprintf(stderr, "\tsupposed answer [%d]: %lf\n", i, yref[i]); - } - double ratio = ((double) local_nnz) / ((double) remote_nnz + local_nnz); - double rows_remote_ratio = ((double) rows_with_nonlocal) / ((double) this->nRows_p); - if (this->Get_this_pe() == 0) { - printf("\nRANK 0: global NNZ = %lu\n", remote_nnz + local_nnz); - printf("RANK 0: global Rows = %d\n", nRows); - } - printf("\nLOCALITY %d : Remote/Local cols %lu/%lu Fraction Columns Local %f Fraction Rows with Remote Columns %f\n", this->Get_this_pe(), remote_nnz, local_nnz, ratio, rows_remote_ratio); -} - - 
template -bool SparseTriangularSolve::CSRCheckCPU(FloatType alpha) -{ - FloatType *NZvalues = SparseMatrix::GetVals(); - int *Cols = SparseMatrix::GetCols(); - int *rowptrs = SparseMatrix::GetRowPtrs(); - double internal_alpha = alpha; - bool all_worked = true; - -#pragma omp parallel for - for(int i = 0; i < nRows; i++) - { -#pragma omp flush (all_worked) - if (all_worked) - { - double temp = 0.; - for(int j = rowptrs[i]; j < rowptrs[i+1]; j++) - { - int ci = Cols[j]; - // Skip anything that lies on the diagonal. We need to solve for that. - temp += NZvalues[j] * yref[ci]; - } - double compare_val = 0.; - double alpha_x = internal_alpha * x[i]; - if(typeid(FloatType) == typeid(float)) - { - compare_val = fabs(alpha_x*1e-3); - if (compare_val < 10*FLT_EPSILON) - compare_val = 10*FLT_EPSILON; - if ((FloatType)(alpha_x - compare_val) > (FloatType)temp || (FloatType)(alpha_x + compare_val) < (FloatType)temp) - { - fflush(stdout); - fprintf(stderr, " CPU CALCULATION ERROR on row %d\n", i); - fprintf(stderr, "\tReal value for row %d: %.*e\n", i, OP_FLT_Digs-1, (float)alpha_x); - fprintf(stderr, "\tCalculated value for row %d: %.*e\n", i, OP_FLT_Digs-1, (float)temp); - all_worked = false; -#pragma omp flush (all_worked) - } - } - else if(typeid(FloatType) == typeid(double)) - { - compare_val = fabs(alpha_x*1e-4); - if (compare_val < 10*DBL_EPSILON) - compare_val = 10*DBL_EPSILON; - if ((FloatType)(alpha_x - compare_val) > (FloatType)temp || (FloatType)(alpha_x + compare_val) < (FloatType)temp) - { - fflush(stdout); - fprintf(stderr, " CPU CALCULATION ERROR on row %d\n", i); - fprintf(stderr, "\tReal value for row %d: %.*le\n", i, OP_DBL_Digs-1, (double)alpha_x); - fprintf(stderr, "\tCalculated value for row %d: %.*le\n", i, OP_DBL_Digs-1, (double)temp); - all_worked = false; -#pragma omp flush (all_worked) - } - } - } - } - return all_worked; -} - - template<> -int SparseTriangularSolve::VerifyResults(int iteration) -{ - int errors = 0; - - #pragma omp parallel for - for 
(int i = 0; i < nRows; i++) - { - int assigned_pe = (i / SPTS_BLOCK_SIZE) % this->Get_total_pes(); - if (this->Get_this_pe() == assigned_pe) { - float compare_val = fabs(yref[i]*1e-3); - if (compare_val < 10*FLT_EPSILON) - compare_val = 10*FLT_EPSILON; - if ((yref[i] - compare_val) > y[i] || (yref[i] + compare_val) < y[i]) - { - #pragma omp critical - { - if(errors == 0) - { - fflush(stdout); - fprintf(stderr, "\nDetected some differences between CPU and GPU results on iteration %d...", iteration); - } - fprintf(stderr, "%d GPU CALCULATION ERROR on row %d\n", this->Get_this_pe(), i); - fprintf(stderr, "\tCPU value for y[%d]: %.*e\n", i, OP_FLT_Digs-1, yref[i]); - fprintf(stderr, "\tGPU value for y[%d]: %.*e\n", i, OP_FLT_Digs-1, y[i]); - errors += 1; - observed_errors[iteration].insert(std::pair (i, y[i])); - } - } - } - } - return errors; -} - - template<> -int SparseTriangularSolve::VerifyResults(int iteration) -{ - int errors = 0; - #pragma omp parallel for - for (int i = 0; i < nRows; i++) - { - double compare_val = fabs(yref[i]*1e-4); - if (compare_val < 10*DBL_EPSILON) - compare_val = 10*DBL_EPSILON; - if ((yref[i] - compare_val) > y[i] || (yref[i] + compare_val) < y[i]) - { - #pragma omp critical - { - if(errors == 0) - { - fflush(stdout); - fprintf(stderr, "\nDetected differences between CPU and GPU results on iteration %d...", iteration); - } - fprintf(stderr, "GPU CALCULATION ERROR on row %d\n", i); - fprintf(stderr, "\tCPU value for y[%d]: %.*e\n", i, OP_DBL_Digs-1, yref[i]); - fprintf(stderr, "\tGPU value for y[%d]: %.*e\n", i, OP_DBL_Digs-1, y[i]); - errors += 1; - observed_errors[iteration].insert(std::pair (i, y[i])); - } - } - } - return errors; -} - -template -int SparseTriangularSolve::NonDeterministicErrors() -{ - int iter = InputFlags::GetValueInt("iterations"); - int non_det_errors = 0; -#ifdef ALL_SYNCFREE - for (int i = 1; i < iter; i++) - { - if (errors_seen[i] != errors_seen[0]) - { - non_det_errors++; - if (non_det_errors == 1) - { - 
fprintf(stderr, "Different SpTS iterations saw different error counts -- non-deterministic bug possible.\n"); - fprintf(stderr, "\te.g. saw %d errors during iteration 0. Saw %d errors during iteration %i\n", errors_seen[0], errors_seen[i], i); - } - } - else if (observed_errors[i] != observed_errors[0]) - { - non_det_errors++; - if (non_det_errors == 1) - { - fprintf(stderr, "ERRORS were seen. Different iterations saw errors on different rows -- non-deterministic bug possible.\n"); - fprintf(stderr, "\te.g. Iterations 0 and %d were different.\n", i); - } - } - } -#else - if (iter >= 1) - { - if (errors_seen[0] != errors_seen[1]) - { - non_det_errors++; - fprintf(stderr, "Different SpTS iterations saw different error counts -- non-deterministic bug possible.\n"); - fprintf(stderr, "\te.g. saw %d errors during iteration 0. Saw %d errors during iteration %i\n", errors_seen[0], errors_seen[1], 1); - } - } - for (int i = 2; i < iter; i++) - { - if (errors_seen[i] != errors_seen[1]) - { - non_det_errors++; - if (non_det_errors == 1) - { - fprintf(stderr, "Different SpTS iterations saw different error counts -- non-deterministic bug possible.\n"); - fprintf(stderr, "\te.g. saw %d errors during iteration 1. Saw %d errors during iteration %i\n", errors_seen[1], errors_seen[i], i); - } - } - else if (observed_errors[i] != observed_errors[1]) - { - non_det_errors++; - if (non_det_errors == 1) - { - fprintf(stderr, "ERRORS were seen. Different iterations saw errors on different rows -- non-deterministic bug possible.\n"); - fprintf(stderr, "\te.g. 
Iterations 1 and %d were different.\n", i); - } - } - } -#endif - return non_det_errors; -} - -template -int SparseTriangularSolve::MaxErrors() -{ - int iter = InputFlags::GetValueInt("iterations"); - int max_errors = 0; - for (int i = 0; i < iter; i++) - { - if (errors_seen[i] > max_errors) - max_errors = errors_seen[i]; - } - return max_errors; -} - -#ifndef __has_builtin -#define __has_builtin(x) 0 -#endif - -static inline unsigned int flp2(unsigned int x) -{ - x |= (x >> 1); - x |= (x >> 2); - x |= (x >> 4); - x |= (x >> 8); - x |= (x >> 16); - return x - (x >> 1); -} - -// Short rows in CSR-Adaptive are batched together into a single row block. -// If there are a relatively small number of these, then we choose to do -// a horizontal reduction (groups of threads all reduce the same row). -// If there are many threads (e.g. more threads than the maximum size -// of our workgroup) then we choose to have each thread serially reduce -// the row. -// This function calculates the number of threads that could team up -// to reduce these groups of rows. For instance, if you have a -// workgroup size of 256 and 4 rows, you could have 64 threads -// working on each row. If you have 5 rows, only 32 threads could -// reliably work on each row because our reduction assumes power-of-2. 
- template< typename rowBlockType > -static inline rowBlockType numThreadsForReduction(const rowBlockType num_rows) -{ -#if defined(__INTEL_COMPILER) - return 256 >> (_bit_scan_reverse(num_rows-1)+1); -#elif (defined(__clang__) && __has_builtin(__builtin_clz)) || \ - !defined(__clang) && \ - defined(__GNUG__) && ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 30202) - return (256 >> (8*sizeof(int)-__builtin_clz(num_rows-1))); -#elif defined(_MSC_VER) && (_MSC_VER >= 1400) - unsigned long bit_returned; - _BitScanReverse(&bit_returned, (num_rows-1)); - return 256 >> (bit_returned+1); -#else - return flp2(256/num_rows); -#endif -} - - template -int SparseTriangularSolve::ComputeRowBlocks(std::vector &rowBlocks, - int *rowDelimiters, - int nRows) -{ - rowBlocks.erase(rowBlocks.begin(), rowBlocks.end()); - rowBlocks.push_back(0); - uint64_t sum = 0; - uint64_t i, last_i = 0; - - // Check to ensure nRows can fit in 32 bits - if ((uint64_t) nRows > (uint64_t)pow(2, ROW_BITS)) - { - fflush(stdout); - fprintf(stderr, "\nNumber of Rows in the Sparse Matrix is greater than what is supported at present (%d bits) !", ROW_BITS ); - exit(0); - } - - int consecutive_long_rows = 0; - for(i = 1; i <= nRows; i++) - { - int row_length = ( rowDelimiters[ i ] - rowDelimiters[ i - 1 ] ); - sum += row_length; - - // The following section of code calculates whether you're moving between - // a series of "short" rows and a series of "long" rows. - // This is because the reduction in CSR-Adaptive likes things to be - // roughly the same length. Long rows can be reduced horizontally. - // Short rows can be reduced one-thread-per-row. Try not to mix them. - if ( row_length > 128 ) - consecutive_long_rows++; - else if ( consecutive_long_rows > 0 ) - { - // If it turns out we WERE in a long-row region, cut if off now. 
- if (row_length < 32) // Now we're in a short-row region - consecutive_long_rows = -1; - else - consecutive_long_rows++; - } - - // If you just entered into a "long" row from a series of short rows, - // then we need to make sure we cut off those short rows. Put them in - // their own workgroup. - if ( consecutive_long_rows == 1 ) - { - // Assuming there *was* a previous workgroup. If not, nothing to do here. - if( i - last_i > 1 ) - { - rowBlocks.push_back( (i - 1) << (64 - ROW_BITS) ); - // If this row fits into CSR-Stream, calculate how many rows - // can be used to do a parallel reduction. - // Fill in the low-order bits with the numThreadsForRed - if (((i-1) - last_i) > 2) - rowBlocks[rowBlocks.size() - 2] |= numThreadsForReduction((i - 1) - last_i); - - last_i = i-1; - sum = row_length; - } - } - else if (consecutive_long_rows == -1) - { - // We see the first short row after some long ones that - // didn't previously fill up a row block. - rowBlocks.push_back( (i - 1) << (64 - ROW_BITS) ); - if (((i-1) - last_i) > 2) - rowBlocks[rowBlocks.size() - 2] |= numThreadsForReduction((i - 1) - last_i); - - last_i = i-1; - sum = row_length; - consecutive_long_rows = 0; - } - - // Now, what's up with this row? What did it do? - - // exactly one row results in non-zero elements to be greater than blockSize - // This is csr-vector case; bottom WG_BITS == workgroup ID - if( ( i - last_i == 1 ) && sum > 1024 ) - { - int numWGReq = ceil( (double)row_length / (1024) ); - - // Check to ensure #workgroups can fit in WG_BITS bits, if not - // then the last workgroup will do all the remaining work - numWGReq = ( numWGReq < (int)pow( 2, WG_BITS ) ) ? 
numWGReq : (int)pow( 2, WG_BITS ); - - for( int w = 1; w < numWGReq; w++ ) - { - rowBlocks.push_back((i-1) << ROW_BITS); - rowBlocks[rowBlocks.size() - 1] |= (uint64_t)w; - } - rowBlocks.push_back(i << ROW_BITS); - - last_i = i; - sum = 0; - consecutive_long_rows = 0; - } - // more than one row results in non-zero elements to be greater than blockSize - // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads - else if( ( i - last_i > 1 ) && sum > 1024 ) - { - i--; // This row won't fit, so back off one. - rowBlocks.push_back( i << (64 - ROW_BITS) ); - if ((i - last_i) > 2) - rowBlocks[rowBlocks.size() - 2] |= numThreadsForReduction(i - last_i); - last_i = i; - sum = 0; - consecutive_long_rows = 0; - } - // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads - else if( sum == 1024 ) - { - rowBlocks.push_back( i << (64 - ROW_BITS) ); - if ((i - last_i) > 2) - rowBlocks[rowBlocks.size() - 2] |= numThreadsForReduction(i - last_i); - last_i = i; - sum = 0; - consecutive_long_rows = 0; - } - } - - // If we didn't fill a row block with the last row, make sure we don't lose it. 
- if ( (rowBlocks[rowBlocks.size() - 2] >> (64 - ROW_BITS)) != (uint64_t)(nRows) ) - { - rowBlocks.push_back( (uint64_t)( nRows ) << (64 - ROW_BITS) ); - if ((nRows - last_i) > 2) - rowBlocks[rowBlocks.size() - 2] |= numThreadsForReduction(i - last_i); - } - - return rowBlocks.size(); -} - - template -float SparseTriangularSolve::CSRSpTSGPU(uint64_t &ns_per_iter, uint64_t &ns_per_analysis_iter, uint64_t &ns_per_syncfree_iter, uint64_t &ns_per_levelset_iter, uint64_t &ns_per_levelsync_iter, FloatType alpha) -{ - gpuInt status; - gpuEvent* event_array; - #ifdef USE_HIP - hipSetDevice(this->Get_this_pe()); - hipDeviceProp_t props; - hipGetDeviceProperties(&props, this->Get_this_pe()); - printf("\nPE %d: PCIe BUS ID %d DEV ID %d\n", this->Get_this_pe(), props.pciBusID, props.pciDeviceID); - event_array = (gpuEvent*)malloc(sizeof(gpuEvent) * 2); - hipEventCreate(&event_array[0]); - hipEventCreate(&event_array[1]); - #else - event_array = (gpuEvent*)malloc(sizeof(gpuEvent)); - #endif - size_t global_work_size; - size_t local_work_size = WF_PER_WG * WF_SIZE; - - - /*************************** Setup and create buffers ********************/ - /****** Matrix Setup Code ******/ - /* Get the OpenCL buffers for the input matrix */ - memPointer bufNonZeroes = SparseMatrix::GetDevVals(); - memPointer bufColumnIndices = SparseMatrix::GetDevCols(); - memPointer bufRowPtrs = SparseMatrix::GetDevRowPtrs(); - /* Get the host buffers for the input matrix */ - FloatType *Avalues = SparseMatrix::GetVals(); - int *Acols = SparseMatrix::GetCols(); - int *rowptrs = SparseMatrix::GetRowPtrs(); - - - /****** Adaptive RowBlocks Setup Code ******/ - numBlocks = ComputeRowBlocks(rowBlocks, rowptrs, nRows); - rowBlocksDev = this->GPU->AllocateMem("rowBlocks", numBlocks*sizeof(int64_t), GPU_MEM_READ_WRITE, NULL); - uint64_t completedRows = 0; - completedRowsDev = this->GPU->AllocateMem("completedRows", sizeof(uint64_t), GPU_MEM_READ_WRITE|GPU_MEM_USE_HOST_PTR, &completedRows); - - /****** SpTS 
Meta-Data Setup Code ******/ - /* Set up the OpenCL buffers for the SpTS meta-data */ - // TODO -- is this +1 in doneArray nRows+1 required? Why? - #ifdef USE_ROCSHMEM - doneArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t)); - reqUpdateArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t)); - shadowDoneArrayDev = rocshmem_malloc((nRows+1)*sizeof(uint32_t)); - #else - doneArrayDev = this->GPU->AllocateMem("doneArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL); - reqUpdateArrayDev = this->GPU->AllocateMem("reqUpdateArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL); - shadowDoneArrayDev = this->GPU->AllocateMem("shadowDoneArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL); - #endif - remoteInProgressArrayDev = this->GPU->AllocateMem("remoteInProgressArray", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL); - numRowsAtLevelDev = this->GPU->AllocateMem("numRowsAtLevel", (nRows)*sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL); - rowMapDev = this->GPU->AllocateMem("rowMap", (nRows+1)*sizeof(uint32_t), GPU_MEM_READ_ONLY, NULL); - maxDepthDev = this->GPU->AllocateMem("maxDepth", sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL); - totalSpinDev = this->GPU->AllocateMem("totalSpin", sizeof(uint64_t), GPU_MEM_READ_WRITE, NULL); - oneBufDev = this->GPU->AllocateMem("oneBuf", sizeof(uint32_t), GPU_MEM_READ_WRITE, NULL); - /* Set up the host buffers for the SpTS meta-data */ - uint32_t *doneArray = (uint32_t*)calloc((nRows+1), sizeof(uint32_t)); - uint32_t *numRowsAtLevel = (uint32_t*)calloc(nRows, sizeof(uint32_t)); - uint32_t *rowMap = (uint32_t*)calloc((nRows+1), sizeof(uint32_t)); - uint32_t maxDepth = 0; - uint64_t totalSpin = 0; - - uint32_t *nrows_plus1_zero = (uint32_t*)calloc((nRows+1), sizeof(uint32_t)); - uint64_t u64_zero = 0; - uint32_t u32_zero = 0; - - //uint32_t uns_int_one = 0x42280000; - uint32_t u32_one = 1; - - // TODO: Gather and flatten out Avalues, Acols, and rowptrs based on - // row cyclic decomposition. 
For now, we just copy the hole vals, cols, - // and row_ptrs matrix, even though we really only access 1/num_pes of the - // whole thing. We can do some more sophisticated stuff here if we run out - // of space on the GPU or we don't like the copy overheads of the initial - // buffers. - - /************************ Copy initial buffers to device *****************/ - /****** Copy matrix ******/ - this->GPU->CopyToDevice(bufNonZeroes, Avalues, this->nNZ*sizeof(FloatType), 0, GPU_TRUE, NULL); - this->GPU->CopyToDevice(bufColumnIndices, Acols, this->nNZ*sizeof(int), 0, GPU_TRUE, NULL); - this->GPU->CopyToDevice(bufRowPtrs, rowptrs, (this->nRows+1)*sizeof(int), 0, GPU_TRUE, NULL); - - /****** Copy vectors ******/ - this->GPU->CopyToDevice(xDev, x, nCols*sizeof(FloatType), 0, GPU_TRUE, NULL); - this->GPU->CopyToDevice(yDev, y_zero, nRows*sizeof(FloatType), 0, GPU_TRUE, NULL); - - /****** Copy adaptive rowBlock information ******/ - this->GPU->CopyToDevice(rowBlocksDev, rowBlocks.data(), numBlocks*sizeof(int64_t), 0, GPU_TRUE, NULL); - - /****** Copy SpTS meta-data needed for analyze_and_solve run ******/ - this->GPU->CopyToDevice(doneArrayDev, nrows_plus1_zero, (nRows+1)*sizeof(uint32_t), 0, GPU_TRUE, NULL); - this->GPU->CopyToDevice(shadowDoneArrayDev, nrows_plus1_zero, (nRows+1)*sizeof(uint32_t), 0, GPU_TRUE, NULL); - this->GPU->CopyToDevice(reqUpdateArrayDev, nrows_plus1_zero, (nRows+1)*sizeof(uint32_t), 0, GPU_TRUE, NULL); - this->GPU->CopyToDevice(remoteInProgressArrayDev, nrows_plus1_zero, (nRows+1)*sizeof(uint32_t), 0, GPU_TRUE, NULL); - this->GPU->CopyToDevice(numRowsAtLevelDev, nrows_plus1_zero, nRows*sizeof(uint32_t), 0, GPU_TRUE, NULL); - this->GPU->CopyToDevice(maxDepthDev, &u32_zero, sizeof(uint32_t), 0, GPU_TRUE, NULL); - this->GPU->CopyToDevice(totalSpinDev, &u64_zero, sizeof(uint64_t), 0, GPU_TRUE, NULL); - this->GPU->CopyToDevice(oneBufDev, &u32_one, sizeof(uint32_t), 0, GPU_TRUE, NULL); - - /************************** Set up iteration printing 
********************/ - /* We want to print, ideally, every iteration that gets up 10% closer to - * completion. This sets that up */ - int iter = InputFlags::GetValueInt("iterations"); - double print_iter = (float)iter / 10.; - if (print_iter < 1.) - print_iter = 1.; - double next_to_print = 0.; - - - /**************************** Set up perf analysis ************************/ - // For performance analysis, keep track of how much time we've spent doing - // kernel work. - // TODO -- Take in from the command line whether to get kernel or total time. - // If doing total time, try launching all of the kernels at once and waiting - // outside. This will apparently reduce the overheads. - uint64_t total_kern_time = 0; - uint64_t analyze_kern_time = 0; - double analyze_kern_flops = 0.; - uint64_t syncfree_kern_time = 0; - uint64_t levelset_kern_time = 0; - uint64_t levelsync_kern_time = 0; - - errors_seen = new int[iter]; - - int analysis_iter = 0; - int syncfree_iter = 0; - int levelset_iter = 0; - int levelsync_iter = 0; - - int level_sync_cutoff = InputFlags::GetValueInt("levelsync_size"); - bool syncfree_better = false; - - int total_workitems_per_workgroup = WF_SIZE * WF_PER_WG; - //bool rocshmem_initialized = false; - - /*********************** Actual work of the benchmark *********************/ - for(int i = 0; i < iter; i++) - { - if (i == (int)next_to_print || i == (iter - 1)) - { - printf("%d..", i+1);fflush(stdout); - next_to_print += print_iter; - } - -#ifndef ALL_SYNCFREE -#ifdef ALL_ANALYZE - // When we only want to run the analyze-and-solve mechanism, rather than - // the more optimized syncfree algorithm, we always go into here. - if (1) -#else - // In any version of the program that has the possibility of running the - // level-set algorithm, we need to start with the syncfree-and-analyze - // version of the program, so that we can set up the potential to run the - // level-set algorithm. This will take place on the first iteration. 
- if (i == 0) -#endif - { - analysis_iter++; - global_work_size = nRows * WF_SIZE; - #ifndef USE_HIP - CLHelper *CL = dynamic_cast(this->GPU); - CL->SetArgs(CLHelper::SpTSKernel_analyze, 0, - bufNonZeroes, - bufColumnIndices, - bufRowPtrs, - xDev, - yDev, - alpha, - doneArrayDev, - numRowsAtLevelDev, - maxDepthDev, - totalSpinDev); - - status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_analyze, 1, NULL, &global_work_size, NULL, 0, NULL, &event_array[0]); - CL->checkStatus(status,"clEnqueueNDRangeKernel failed"); - this->GPU->Flush(); - total_kern_time += CL->ComputeTime(event_array[0]); - analyze_kern_time += CL->ComputeTime(event_array[0]); - #else - int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1) - / total_workitems_per_workgroup; - #ifdef USE_ROCSHMEM - global_work_size = this->nRows_p * WF_SIZE; - num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1) - / total_workitems_per_workgroup; - /* - int rocshmem_queues = (2560 / WF_PER_WG); - if (2560 % WF_PER_WG) - rocshmem_queues++; - if (!rocshmem_initialized) { - int num_threads = InputFlags::GetValueInt("num_roshmem_threads"); - rocshmem_init(&handle, rocshmem_queues); - rocshmem_initialized = true; - } - */ - int rocshmem_algorithm = InputFlags::GetValueInt("rocshmem_algorithm"); - int rocshmem_put_block_size = InputFlags::GetValueInt("put_block_size"); - int rocshmem_get_backoff_factor = InputFlags::GetValueInt("get_backoff_factor"); - switch (rocshmem_algorithm) { - case 0: - printf("Using Put-based intra-kernel algorithm\n"); - break; - case 1: - printf("Using Get-based intra-kernel algorithm (Backoff factor %d)\n", rocshmem_get_backoff_factor); - break; - case 2: - printf("Using blocked Put-based intra-kernel algorithm\n"); - printf("Using blocked Put-based intra-kernel algorithm (Block Size %d)\n", rocshmem_put_block_size); - break; - case 3: - printf("Using put/get hybrid intra-kernel algorithm\n"); - break; - default: - 
printf("Unknown rocSHMEM algorithm\n"); - exit(-1); - } - size_t LDS_size; - rocshmem_dynamic_shared(&LDS_size); - printf("Work size %zu, wg size %d num workgroups %d LDS %zu thisPE %d Global %d \n", global_work_size, total_workitems_per_workgroup, num_of_workgroups, LDS_size, this->Get_this_pe(), this->Get_total_pes()); - MPI_Barrier(MPI_COMM_WORLD); - hipEventRecord(event_array[0], NULL); - hipLaunchKernelGGL(amd_spts_analyze_and_solve, - dim3(num_of_workgroups), - dim3(total_workitems_per_workgroup), - LDS_size, 0, - global_work_size, - this->Get_this_pe(), - this->Get_total_pes(), - static_cast(shadowDoneArrayDev), - static_cast(reqUpdateArrayDev), - static_cast(remoteInProgressArrayDev), - static_cast(oneBufDev), - rocshmem_algorithm, - rocshmem_put_block_size, - rocshmem_get_backoff_factor, - SPTS_BLOCK_SIZE, - static_cast(bufNonZeroes), - static_cast(bufColumnIndices), - static_cast(bufRowPtrs), - static_cast(xDev), - static_cast(yDev), - alpha, - static_cast(doneArrayDev), - static_cast(numRowsAtLevelDev), - static_cast(maxDepthDev), - static_cast(totalSpinDev)); - #else - hipEventRecord(event_array[0], NULL); - hipLaunchKernelGGL(amd_spts_analyze_and_solve, - dim3(num_of_workgroups), - dim3(total_workitems_per_workgroup), - 0, 0, - global_work_size, - static_cast(bufNonZeroes), - static_cast(bufColumnIndices), - static_cast(bufRowPtrs), - static_cast(xDev), - static_cast(yDev), - alpha, - static_cast(doneArrayDev), - static_cast(numRowsAtLevelDev), - static_cast(maxDepthDev), - static_cast(totalSpinDev)); - #endif - hipEventRecord(event_array[1], NULL); - hipEventSynchronize(event_array[1]); - - #ifdef USE_ROCSHMEM - // Wait for any outstanding network messages to finish up. We - // can have straggler updates to the doneArray that we don't - // have any dependencies for but we still eed it to finish so - // the below statistics can work correctly. 
- //ro_shmem_dump_stats(handle); - //ro_shmem_reset_stats(handle); - //sleep(10); - /* if( this->Get_this_pe() == 0 && (this->Get_total_pes() > 1)){ - PRINT_SQ(get_rtn_handle(handle), 0, 1, 0); - PRINT_CQ(get_rtn_handle(handle), 0, 1, 0); - PRINT_SQ(get_rtn_handle(handle), 0, 1, 1); - PRINT_CQ(get_rtn_handle(handle), 0, 1, 1); - PRINT_SQ(get_rtn_handle(handle), 0, 1, 2); - PRINT_CQ(get_rtn_handle(handle), 0, 1, 2); - }*/ - MPI_Barrier(MPI_COMM_WORLD); - #endif - float elapsed; - hipEventElapsedTime(&elapsed, event_array[0], event_array[1]); - total_kern_time += elapsed * 1000000; - analyze_kern_time += elapsed * 1000000; - #endif - analyze_kern_flops = (2 * (double)nNZ * 1000000000.) / (double)analyze_kern_time; - this->GPU->CopyToHost(yDev, y, nRows*sizeof(FloatType), 0, GPU_FALSE, NULL); - this->GPU->CopyToHost(maxDepthDev, &maxDepth, sizeof(uint32_t), 0, GPU_FALSE, NULL); - this->GPU->CopyToHost(doneArrayDev, doneArray, (nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL); - this->GPU->CopyToHost(totalSpinDev, &totalSpin, sizeof(uint64_t), 0, GPU_FALSE, NULL); - this->GPU->CopyToHost(numRowsAtLevelDev, numRowsAtLevel, nRows*sizeof(uint32_t), 0, GPU_TRUE, NULL); - this->GPU->Flush(); - - #ifdef USE_ROCSHMEM - // Combine global statistics - MPI_Allreduce(MPI_IN_PLACE, (void *) &maxDepth, 1, MPI_UNSIGNED, MPI_MAX, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, (void *) &totalSpin, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD); - - // TODO: Broadcast out the doneArray and yDev values to all nodes. This - // is needed for the below calculations to work since in the 'pull' - // distributed model we don't request data for rows that we don't - // have a dependency on. 
- #endif - - bool verify = InputFlags::GetValueBool("verify"); - if (verify) { - printf("Performing results verification\n"); - errors_seen[i] = VerifyResults(i); - } - printf("\nTotalSpin: %lu\n", totalSpin); - - /* Prefix sum of the number of rows at each level, so that we can - * calculate how much to offset each level into the rowMap */ - // TODO -- Do this prefix sum on the GPU while copying maxDepth and - // doneArray back into the host. Set non-blocking on the previous ones. - this->GPU->CopyToHost(numRowsAtLevelDev, numRowsAtLevel, nRows*sizeof(uint32_t), 0, GPU_TRUE, NULL); - for (unsigned int joe = 1; joe < maxDepth; joe++) - numRowsAtLevel[joe] = numRowsAtLevel[joe] + numRowsAtLevel[joe-1]; - - /* Build up the rowMap so that each iteration of the no-wait solve - * knows what it's global_id->row mapping is. - * The general mechanism for this is as follows: - * doneArray[row] holds the level that a particular row is in. - * - * We know the total number of levels needed (maxDepth), so rowMap - * has maxDepth 'buckets'. - * - * The numRowsAtLevel array (after the above prefix-sum) tells us - * how many values are in all of the previous buckets, so that - * we can get an appropriate array offset for each bucket. - * - * The counters array keeps track of how many items are in each - * bucket so far. Add this to the numRowsAtLevel[] offset. - * - * As we walk through all the rows, we check to see which level's - * bucket we should put this row in. Add it at the end of the - * current bucket, then increment the counter. */ - uint32_t *counters = (uint32_t *)calloc(maxDepth, sizeof(uint32_t)); -/* for (unsigned int this_row = 0; this_row < nRows; this_row++) - { - // We must subtract one here, because the first level is '1' - // The GPU kernel does that because a value of '0' means - // 'not done, keep waiting' in the analysis kernel. 
- assert(doneArray[this_row] != 0); - unsigned int this_rows_level = doneArray[this_row] - 1; - unsigned int previous_level = this_rows_level - 1; - unsigned int depth_offset; - if (this_rows_level == 0) // can't check previous level - depth_offset = 0; - else - depth_offset = numRowsAtLevel[previous_level]; - rowMap[depth_offset + counters[this_rows_level]] = this_row; - counters[this_rows_level] += 1; - } */ - free(counters); - this->GPU->CopyToDevice(rowMapDev, rowMap, (nRows+1)*sizeof(uint32_t), 0, GPU_TRUE, NULL); - free(event_array); - #ifdef USE_HIP - event_array = (gpuEvent*)malloc(maxDepth * sizeof(gpuEvent) * 2); - for (int i = 0; i < maxDepth * 2; i++) - hipEventCreate(&event_array[i]); - #else - event_array = (gpuEvent*)malloc(maxDepth * sizeof(gpuEvent)); - #endif -#ifdef ALL_ANALYZE - // We will be coming back into this kernel. Time to reset its data. - if (i != (iter - 1)) - { - this->GPU->CopyToDevice(maxDepthDev, &u32_zero, sizeof(uint32_t), 0, GPU_FALSE, NULL); - this->GPU->CopyToDevice(totalSpinDev, &u64_zero, sizeof(uint64_t), 0, GPU_FALSE, NULL); - this->GPU->CopyToDevice(numRowsAtLevelDev, nrows_plus1_zero, nRows*sizeof(uint32_t), 0, GPU_FALSE, NULL); - } -#endif - this->GPU->CopyToDevice(yDev, y_zero, nRows*sizeof(FloatType), 0, GPU_FALSE, NULL); - this->GPU->CopyToDevice(doneArrayDev, nrows_plus1_zero,(nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL); - this->GPU->CopyToDevice(shadowDoneArrayDev, nrows_plus1_zero,(nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL); - this->GPU->CopyToDevice(remoteInProgressArrayDev, nrows_plus1_zero,(nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL); - this->GPU->CopyToDevice(reqUpdateArrayDev, nrows_plus1_zero, (nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL); - this->GPU->Flush(); - // Either we always want to run just this function block - // (ALL_ANALYZE), or the first iteration is the analyze-and-solve - // kernel. Either way, don't continue to the code below this time. 
- continue; - } -#endif - // If ALL_SYNCFREE is defined, we always run the amd_spts_syncfree_solve - // kernel. We never try to speed it up by paying attention to the output - // levels and running the levelset kernel. - // If ALL_LEVELSET is set, we only run the analysis kernel up above to get the - // level-set and do the first solve; after that we skip the - // amd_spts_analyze_and_solve kernel and do the level-set based solve. - // Otherwise, we dynamically choose between those kernels based on some - // statistics that we gathered during the analyze-and-solve run. -#ifdef ALL_SYNCFREE - if (1) // always run syncfree algorithm -#elif defined(ALL_LEVELSET) || defined(ALL_LEVELSYNC) - if (0) // always *do not* run syncfree algorithm -#else - if (totalSpin == 0 || analyze_kern_flops/totalSpin > 25000 || syncfree_better) // Try to run syncfree -#endif - { - syncfree_iter++; - // TODO -- Eventually get this working with numer of RowBlocks - 1 - global_work_size = nRows * WF_SIZE; - - uint32_t current_iteration = 0; - - #ifdef USE_ROCSHMEM - fprintf(stderr, "rocSHMEM not supported for selected algorithm\n"); - exit(-1); - #endif - - #ifndef USE_HIP - CLHelper *CL = dynamic_cast(this->GPU); - CL->SetArgs(CLHelper::SpTSKernel, 0, - bufNonZeroes, - bufColumnIndices, - bufRowPtrs, - xDev, - yDev, - alpha, - doneArrayDev, - numRowsAtLevelDev, - maxDepthDev, - totalSpinDev); - - status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel, 1, NULL, &global_work_size, NULL, 0, NULL, &event_array[0]); - CL->checkStatus(status,"clEnqueueNDRangeKernel failed"); - current_iteration++; - this->GPU->Flush(); - total_kern_time += CL->ComputeTime(event_array[0]); - syncfree_kern_time += CL->ComputeTime(event_array[0]); - #else - int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1) - / total_workitems_per_workgroup; - hipEventRecord(event_array[0], NULL); - hipLaunchKernelGGL(amd_spts_syncfree_solve, - dim3(num_of_workgroups), - 
dim3(total_workitems_per_workgroup), - 0, 0, - global_work_size, - static_cast(bufNonZeroes), - static_cast(bufColumnIndices), - static_cast(bufRowPtrs), - static_cast(xDev), - static_cast(yDev), - alpha, - static_cast(doneArrayDev), - static_cast(numRowsAtLevelDev), - static_cast(maxDepthDev), - static_cast(totalSpinDev)); - hipEventRecord(event_array[1], NULL); - hipEventSynchronize(event_array[1]); - current_iteration++; - float elapsed; - hipEventElapsedTime(&elapsed, event_array[0], event_array[1]); - total_kern_time += elapsed * 1000000; - syncfree_kern_time += elapsed * 1000000; - - #endif - - this->GPU->CopyToHost(yDev, y, nRows*sizeof(FloatType), 0, GPU_TRUE, NULL); - errors_seen[i] = VerifyResults(i); - - this->GPU->Flush(); - current_iteration = 0; - completedRows = 0; - } -#if defined(ALL_LEVELSYNC) - else if (1) // always run levelset+syncfree combination -#elif defined (ALL_LEVELSET) - else if (0) // Fall through to level-set -#else - else if (1) // always run levelset+syncfree, never fall through to level-set only -#endif - { - // This is the "level-sync" algorithm, where we take the level-set - // information and launch kernels that combine multiple levels - // together. This allows us to find parallelism to run on the GPU, - // even if technically there are some data dependencies between the - // levels. Within the kernel, we use the synchronization-free algorithm - // to ensure that we get the right answer. - // This algorithm reduces the spin-loop overhead of the sync-free - // algorithm if there are many levels, but it finds more parallelism - // than the pure level-set algorithms which can only run on one CU. - levelsync_iter++; - - // Keep track of total kernels we launch so we can watch for events. - int total_enqueues = 0; - - /* The rowMap tells each workgroup within the kernel what - * rows it is working on. However, each each kernel invocation - * is working on a different level. Each level is in a separate - * 'bucket' in the rowMap. 
We must tell each invocation how far - * into the rowMap it much index. That's the depth_offset. - * numRowsAtLevel (after the above prefix-sum) tells us how - * many rows were in all previous levels combined. */ - unsigned int depth_offset = 0; - unsigned int running_total = 0; // How many rows in this launch - - if (level_sync_cutoff == 0) - { - if (nRows/maxDepth < 32) - level_sync_cutoff = 2560; - else - level_sync_cutoff = 81920; - } - - #ifdef USE_ROCSHMEM - fprintf(stderr, "rocSHMEM not supported for selected algorithm\n"); - exit(-1); - #endif - - for (int this_level = 0; this_level < maxDepth; this_level++) - { - if (this_level != 0 && running_total == 0) - depth_offset = numRowsAtLevel[this_level-1]; - - running_total = numRowsAtLevel[this_level] - depth_offset; - - if (running_total >= level_sync_cutoff) - { - global_work_size = (running_total + (running_total % WF_PER_WG)) * WF_SIZE; - #ifndef USE_HIP - CLHelper *CL = dynamic_cast(this->GPU); - CL->SetArgs(CLHelper::SpTSKernel_levelsync, 0, - bufNonZeroes, - bufColumnIndices, - bufRowPtrs, - xDev, - yDev, - alpha, - doneArrayDev, - rowMapDev, - depth_offset); - status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_levelsync, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &event_array[total_enqueues]); - this->GPU->checkStatus(status,"clEnqueueNDRangeKernel failed"); - #else - int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1) - / total_workitems_per_workgroup; - hipEventRecord(event_array[total_enqueues * 2], NULL); - hipLaunchKernelGGL(amd_spts_levelsync_solve, - dim3(num_of_workgroups), - dim3(total_workitems_per_workgroup), - 0, 0, - global_work_size, - static_cast(bufNonZeroes), - static_cast(bufColumnIndices), - static_cast(bufRowPtrs), - static_cast(xDev), - static_cast(yDev), - alpha, - static_cast(doneArrayDev), - static_cast(rowMapDev), - depth_offset); - hipEventRecord(event_array[total_enqueues * 2 + 1], NULL); - #endif - 
total_enqueues++; - running_total = 0; - } - } - if (running_total) - { - global_work_size = (running_total + (running_total % WF_PER_WG)) * WF_SIZE; - #ifndef USE_HIP - CLHelper *CL = dynamic_cast(this->GPU); - CL->SetArgs(CLHelper::SpTSKernel_levelsync, 0, - bufNonZeroes, - bufColumnIndices, - bufRowPtrs, - xDev, - yDev, - alpha, - doneArrayDev, - rowMapDev, - depth_offset); - status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_levelsync, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &event_array[total_enqueues]); - this->GPU->checkStatus(status,"clEnqueueNDRangeKernel failed"); - #else - int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1) - / total_workitems_per_workgroup; - hipEventRecord(event_array[total_enqueues * 2], NULL); - hipLaunchKernelGGL(amd_spts_levelsync_solve, - dim3(num_of_workgroups), - dim3(total_workitems_per_workgroup), - 0, 0, - global_work_size, - static_cast(bufNonZeroes), - static_cast(bufColumnIndices), - static_cast(bufRowPtrs), - static_cast(xDev), - static_cast(yDev), - alpha, - static_cast(doneArrayDev), - static_cast(rowMapDev), - depth_offset); - hipEventRecord(event_array[total_enqueues * 2 + 1], NULL); - #endif - total_enqueues++; - } - - // After we cross this clFinish, all of the kernel invocations have - // completed, and the final answer is in yDev. Now we should add up - // all of the kernel runtimes from all levels to see how long this - // levelset solve took. 
- this->GPU->Flush(); - for (int this_enqueue = 0; this_enqueue < total_enqueues; this_enqueue++) - { - #ifndef USE_HIP - CLHelper *CL = dynamic_cast(this->GPU); - total_kern_time += CL->ComputeTime(event_array[this_enqueue]); - levelsync_kern_time += CL->ComputeTime(event_array[this_enqueue]); - #else - float elapsed; - hipEventElapsedTime(&elapsed, event_array[this_enqueue * 2], event_array[this_enqueue * 2 + 1]); - total_kern_time += elapsed * 1000000; - levelsync_kern_time += elapsed * 1000000; - #endif - } - // The analyze kernel is about 15% slower than the syncfree kernel. - // As such, if the level-sync verseion is < 15% faster, it's likely - // that syncfree will win. Let's go back to doing that. - if (i == 1 && (analyze_kern_time < (levelsync_kern_time * 1.15))) - syncfree_better = true; - this->GPU->CopyToHost(yDev, y, nRows*sizeof(FloatType), 0, GPU_TRUE, NULL); - errors_seen[i] = VerifyResults(i); - } - else // Run level-set algorithm - { - // This is the level-set SpTS kernel, which can be done after the - // first analyze-and-solve kernel. In this case, we know the levels - // that each row is in, so we can launch one kernel per level with - // exactly the right number of workgroups (one WG per row). - // This means that we don't have any in-kernel atomics, spin-loops, - // etc, so things run much faster. However, we much launch a - // potentially large number of kernels. - // Number of levels is maxDepth. */ - levelset_iter++; - - #ifdef USE_ROCSHMEM - fprintf(stderr, "rocSHMEM not supported for selected algorithm\n"); - exit(-1); - #endif - - // Keep track of total kernels we launch so we can watch for events. 
- int total_enqueues = 0; - - unsigned int start_level = 0; - unsigned int end_level = 0; - unsigned int in_a_run = 0; - unsigned int running_total = 0; - - // How far into the rowMap that lists which rows are in each level - unsigned int depth_offset = 0; - - unsigned int total_vector = 0; - unsigned int total_levelset = 0; - for (int this_level = 0; this_level < maxDepth; this_level++) - { - unsigned int inner_depth_offset; - if (this_level == 0) - inner_depth_offset = 0; - else - inner_depth_offset = numRowsAtLevel[this_level-1]; - unsigned int total_in_this_depth = numRowsAtLevel[this_level] - inner_depth_offset; - - if (total_in_this_depth == 0) - continue; - - end_level = this_level; - // Comment out this if(){} section to force us to always - // launch the levelset kernel. - if (total_in_this_depth <= 2*WF_PER_WG) - { - running_total += total_in_this_depth; - if (in_a_run == 0) - { - start_level = this_level; - depth_offset = inner_depth_offset; - in_a_run = 1; - } - } - else - { - if (in_a_run) - { - global_work_size = WF_SIZE * WF_PER_WG; - #ifndef USE_HIP - CLHelper *CL = dynamic_cast(this->GPU); - CL->SetArgs(CLHelper::SpTSKernel_vector, 0, - bufNonZeroes, - bufColumnIndices, - bufRowPtrs, - xDev, - yDev, - alpha, - rowMapDev, - numRowsAtLevelDev, - depth_offset, - start_level, - end_level); - status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_vector, 1, NULL, &global_work_size, &global_work_size, 0, NULL, &event_array[total_enqueues]); - this->GPU->checkStatus(status,"clEnqueueNDRangeKernel failed"); - #else - int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1) - / total_workitems_per_workgroup; - hipEventRecord(event_array[total_enqueues * 2], NULL); - hipLaunchKernelGGL(amd_spts_vector_solve, - dim3(num_of_workgroups), - dim3(total_workitems_per_workgroup), - 0, 0, - global_work_size, - static_cast(bufNonZeroes), - static_cast(bufColumnIndices), - static_cast(bufRowPtrs), - static_cast(xDev), - 
static_cast(yDev), - alpha, - static_cast(rowMapDev), - static_cast(numRowsAtLevelDev), - depth_offset, - start_level, - end_level); - hipEventRecord(event_array[total_enqueues * 2 + 1], NULL); - #endif - total_enqueues++; - //printf("\n\tVector. offset %u Start %u End %u Rows in this enq %u\n", depth_offset, start_level, end_level, running_total); - in_a_run = start_level = end_level = running_total = 0; - depth_offset = numRowsAtLevel[this_level-1]; - total_vector++; - } - global_work_size = WF_SIZE * total_in_this_depth; - #ifndef USE_HIP - CLHelper *CL = dynamic_cast(this->GPU); - CL->SetArgs(CLHelper::SpTSKernel_levelset, 0, - bufNonZeroes, - bufColumnIndices, - bufRowPtrs, - xDev, - yDev, - rowMapDev, - depth_offset, - alpha); - status = clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_levelset, 1, NULL, &global_work_size, NULL, 0, NULL, &event_array[total_enqueues]); - this->GPU->checkStatus(status,"clEnqueueNDRangeKernel failed"); - #else - int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1) - / total_workitems_per_workgroup; - hipEventRecord(event_array[total_enqueues * 2], NULL); - hipLaunchKernelGGL(amd_spts_levelset_solve, - dim3(num_of_workgroups), - dim3(total_workitems_per_workgroup), - 0, 0, - global_work_size, - static_cast(bufNonZeroes), - static_cast(bufColumnIndices), - static_cast(bufRowPtrs), - static_cast(xDev), - static_cast(yDev), - static_cast(rowMapDev), - depth_offset, - alpha); - hipEventRecord(event_array[total_enqueues * 2 + 1], NULL); - #endif - total_enqueues++; - depth_offset = numRowsAtLevel[this_level]; - total_levelset++; - } - } - end_level++; - if (in_a_run) - { - #ifndef USE_HIP - CLHelper *CL = dynamic_cast(this->GPU); - CL->SetArgs(CLHelper::SpTSKernel_vector, 0, - bufNonZeroes, - bufColumnIndices, - bufRowPtrs, - xDev, - yDev, - alpha, - rowMapDev, - numRowsAtLevelDev, - depth_offset, - start_level, - end_level); - global_work_size = WF_SIZE * WF_PER_WG; - status = 
clEnqueueNDRangeKernel(CLHelper::commandQueue, CLHelper::SpTSKernel_vector, 1, NULL, &global_work_size, &global_work_size, 0, NULL, &event_array[total_enqueues]); - this->GPU->checkStatus(status,"clEnqueueNDRangeKernel failed"); - #else - int num_of_workgroups = (global_work_size + total_workitems_per_workgroup - 1) - / total_workitems_per_workgroup; - hipEventRecord(event_array[total_enqueues * 2], NULL); - hipLaunchKernelGGL(amd_spts_vector_solve, - dim3(num_of_workgroups), - dim3(total_workitems_per_workgroup), - 0, 0, - global_work_size, - static_cast(bufNonZeroes), - static_cast(bufColumnIndices), - static_cast(bufRowPtrs), - static_cast(xDev), - static_cast(yDev), - alpha, - static_cast(rowMapDev), - static_cast(numRowsAtLevelDev), - depth_offset, - start_level, - end_level); - hipEventRecord(event_array[total_enqueues * 2 + 1], NULL); - #endif - total_enqueues++; - //printf("\n\tVector. offset %u Start %u End %u Rows in this enq %u\n", depth_offset, start_level, end_level, running_total); - in_a_run = start_level = end_level = running_total = 0; - total_vector++; - } - - if (i == 1) - printf("\nTotal Vector: %u\nTotal levelset: %u\n", total_vector, total_levelset); - // After we cross this clFinish, all of the kernel invocations have - // completed, and the final answer is in yDev. Now we should add up - // all of the kernel runtimes from all levels to see how long this - // levelset solve took. 
- this->GPU->Flush(); - for (int this_enqueue = 0; this_enqueue < total_enqueues; this_enqueue++) - { - #ifndef USE_HIP - CLHelper *CL = dynamic_cast(this->GPU); - total_kern_time += CL->ComputeTime(event_array[this_enqueue]); - levelset_kern_time += CL->ComputeTime(event_array[this_enqueue]); - #else - float elapsed; - hipEventElapsedTime(&elapsed, event_array[this_enqueue * 2], event_array[this_enqueue * 2 + 1]); - total_kern_time += elapsed * 1000000; - levelset_kern_time += elapsed * 1000000; - #endif - } - this->GPU->CopyToHost(yDev, y, nRows*sizeof(FloatType), 0, GPU_TRUE, NULL); - errors_seen[i] = VerifyResults(i); - } - -#ifndef ALL_SYNCFREE - if (i == 1) - printf("\nmaxDepth %d\n", maxDepth); -#endif - if (i != (iter - 1)) - { - this->GPU->CopyToDevice(yDev, y_zero, nRows*sizeof(FloatType), 0, GPU_FALSE, NULL); - this->GPU->CopyToDevice(doneArrayDev, nrows_plus1_zero,(nRows+1)*sizeof(uint32_t), 0, GPU_FALSE, NULL); - this->GPU->Flush(); - } - } - - float gflops = 0.f; - printf("\n\nnnz: %d\n", nNZ); - gflops = (float)(2 * nNZ) / (float)(total_kern_time/iter); - ns_per_iter = total_kern_time/iter; - - if (analysis_iter > 0) - ns_per_analysis_iter = analyze_kern_time / analysis_iter; - else - ns_per_analysis_iter = 0; - if (syncfree_iter > 0) - ns_per_syncfree_iter = syncfree_kern_time / syncfree_iter; - else - ns_per_syncfree_iter = 0; - if (levelset_iter > 0) - ns_per_levelset_iter = levelset_kern_time / levelset_iter; - else - ns_per_levelset_iter = 0; - if (levelsync_iter > 0) - ns_per_levelsync_iter = levelsync_kern_time / levelsync_iter; - else - ns_per_levelsync_iter = 0; - - this->GPU->CopyToHost(yDev, y, nRows*sizeof(FloatType), 0, GPU_TRUE, NULL); - - if (doneArray) - free(doneArray); - if (numRowsAtLevel) - free(numRowsAtLevel); - if (rowMap) - free(rowMap); - if (event_array) - free(event_array); - - return gflops; -} - -#endif //SpTS_H diff --git a/projects/rocshmem/internal/clients/spts/SparseMatrix.h 
b/projects/rocshmem/internal/clients/spts/SparseMatrix.h deleted file mode 100644 index bd36d65a00..0000000000 --- a/projects/rocshmem/internal/clients/spts/SparseMatrix.h +++ /dev/null @@ -1,287 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- ********************************************************************************/ -#ifndef SparseMatrix_H -#define SparseMatrix_H - -#include "GPUHelper.h" -#ifndef USE_HIP -#include "OpenCLHelper.h" -#include -#else -#include "HIPHelper.h" -#endif - -#include "InputFlags.h" -#include "MatrixMarketReader.h" -#include "OpenCLHelper.h" -#include -#include - -template -class SparseMatrix -{ - - public: - int nRows; - int nCols; - int nNZ; - - int *cols; - int *row_ptrs; - - FloatType *vals; - - memPointer d_cols; - memPointer d_vals; - memPointer d_row_ptrs; - - // info about parallel procs - int this_pe; - int total_pes; - - int nRows_p; - int nCols_p; - - protected: - - GPUHelper *GPU; - - public: - - SparseMatrix() : nRows(0), nCols(0), nNZ(0), nRows_p(0), nCols_p(0) - { - cols = NULL; - row_ptrs = NULL; - vals = NULL; - - d_cols = NULL; - d_vals = NULL; - d_row_ptrs = NULL; - - this_pe = -1;//rocshmem_my_pe(handle); // this pe - total_pes = -1;//rocshmem_n_pes(handle); // total number of pes - - } - void AllocateSparseMatrix(MatrixMarketReader &mm_reader, - InputFlags &in_flags, - GPUHelper *gpu); - void AllocateParallelSparseMatrix(MatrixMarketReader &mm_reader, - InputFlags &in_flags); - void ConvertFromCOOToCSR(Coordinate *coords, - InputFlags &in_flags); - - void PopulateParallelSparseMatrix(MatrixMarketReader &mm_reader, - InputFlags &in_flags); - - void FindStatsForParallelDecomposition(); - - void Set_total_pes(int val){ - this->total_pes = val; - } - void Set_this_pe(int val){ - this->this_pe = val; - } - - int Get_total_pes(){ - return this->total_pes; - } - int Get_this_pe(){ - return this->this_pe; - } - - int GetNumRows_p() {return nRows_p;} - - int *GetCols() { return cols; } - FloatType *GetVals() { return vals; } - int *GetRowPtrs() { return row_ptrs; } - - memPointer GetDevCols() {return d_cols; } - memPointer GetDevVals() {return d_vals; } - memPointer GetDevRowPtrs() {return d_row_ptrs; } - - ~SparseMatrix() - { - delete[] cols; - delete[] 
vals; - delete[] row_ptrs; - - GPU->FreeMem(d_cols); - GPU->FreeMem(d_vals); - GPU->FreeMem(d_row_ptrs); - } -}; - -template -void SparseMatrix::AllocateSparseMatrix(MatrixMarketReader &mm_reader, - InputFlags &in_flags, - GPUHelper *gpu) -{ - GPU = gpu; - nRows = mm_reader.GetNumRows(); - nCols = mm_reader.GetNumCols(); - nNZ = mm_reader.GetNumNonZeroes(); - printf("Allocating a sparse matrix with-- nRows: %d nCols: %d nNZ: %d\n", nRows, nCols, nNZ); - - assert(total_pes != -1); - assert(this_pe != -1); - - #ifdef USE_RO_SHMEM - if (nRows != nCols){ - fprintf(stderr, "RO_SHMEM port requires the global matrix to be " - "square!\n"); - exit(-1); - } - #endif - - cols = new int[nNZ]; - if (cols == NULL) - { - fprintf(stderr, "Failed to allocate host-side cols array !\n"); - exit(-1); - } - vals = new FloatType[nNZ]; - if (vals == NULL) - { - fprintf(stderr, "Failed to allocate host-side vals array !\n"); - exit(-1); - } - row_ptrs = new int[nRows + 1]; - if (row_ptrs == NULL) - { - fprintf(stderr, "Failed to allocate host-side row_ptrs array !\n"); - exit(-1); - } -} - -template -bool CoordinateCompare(const Coordinate &c1, const Coordinate &c2) -{ - if(c1.x != c2.x) - return (c1.x < c2.x); - else - return (c1.y < c2.y); -} - -template -void SparseMatrix::ConvertFromCOOToCSR(Coordinate *coords, - InputFlags &in_flags) -{ - std::sort(coords, coords + nNZ, CoordinateCompare); - - int current_row = 1; - bool has_seen_diagonal = false; - row_ptrs[0] = 0; - for (int i = 0; i < nNZ; i++) - { - cols[i] = coords[i].y; - vals[i] = coords[i].val; - //fprintf(stderr,"Row %d Col %d Val %lf (cur_row: %d)\n", coords[i].x, coords[i].y, coords[i].val, current_row-1); - - while(coords[i].x >= current_row) - { - // We've reached the end of a row. Did we see a diagonal? - // If not, the triangular solve will be underconstrained. 
- if (!has_seen_diagonal) - { - fprintf(stderr, "ERROR Converting the COO to CSR.\n"); - fprintf(stderr, "\tMissing diagonal on row %d\n", current_row-1); - exit(-1); - } - has_seen_diagonal = false; - row_ptrs[current_row] = i; - current_row++; - } - if (coords[i].x == coords[i].y) - has_seen_diagonal = true; - - } - row_ptrs[current_row++] = nNZ; - while (current_row <= nRows) - { - if (!has_seen_diagonal) - { - fprintf(stderr, "ERROR Converting the COO to CSR.\n"); - fprintf(stderr, "\tNo values on row %d, so no diagonal.\n", current_row-1); - exit(-1); - } - has_seen_diagonal = false; - row_ptrs[current_row++] = nNZ; - } -} - -template -void SparseMatrix::AllocateParallelSparseMatrix(MatrixMarketReader &mm_reader, - InputFlags &in_flags) -{ - d_cols = GPU->AllocateMem("cols", nNZ*sizeof(int), 0, NULL); - d_vals = GPU->AllocateMem("vals", nNZ*sizeof(FloatType), 0, NULL); - d_row_ptrs = GPU->AllocateMem("row_ptrs", (nRows+1)*sizeof(int), 0, NULL); -} - -template -void SparseMatrix::FindStatsForParallelDecomposition() -{ - - assert(SPTS_BLOCK_SIZE % 64 == 0); - - // Rows left over in the potentially partial final block - int left_over_last_block = nRows % SPTS_BLOCK_SIZE; - printf("%d: lolb %d\n", this_pe, left_over_last_block); - // Number of complete blocks, not including any partial block at the end - int total_blocks = nRows / SPTS_BLOCK_SIZE; - printf("%d: totb %d\n", this_pe, total_blocks); - - // Everyone has at least this many rows - nRows_p = (total_blocks / total_pes) * SPTS_BLOCK_SIZE; - printf("%d: initial nRows_p %d\n", this_pe, nRows_p); - - // Last cycle might not assign to all PEs - int straggler_blocks = total_blocks % total_pes; - if (this_pe < straggler_blocks) - nRows_p += SPTS_BLOCK_SIZE; - printf("%d: straggler nRows_p %d\n", this_pe, nRows_p); - - // Last block of last cycle might have less than SPTS_BLOCK_SIZE rows - if (left_over_last_block) { - int final_pe = ((total_blocks + 1) % total_pes) - 1; - if (final_pe == -1) - final_pe = 
total_pes - 1; - if (this_pe == final_pe) - nRows_p += left_over_last_block; - } - printf("%d: final nRows_p %d\n", this_pe, nRows_p); - - if (nRows_p <= 0) { - fprintf(stderr, "Block Size %d too small for input row size %d with " - "%d number of nodes. Please decrease the block size or " - "decrease the number of nodes\n", SPTS_BLOCK_SIZE, nRows, - total_pes); - exit(-1); - } - - // print to check! - printf("\nPE: %d total_rows: %d my_rows: %d\n", this_pe, nRows, nRows_p); - - nCols_p = nCols; // 1D decomposition -} - -#endif diff --git a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_hip b/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_hip deleted file mode 100755 index 03b392ace3..0000000000 --- a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_hip +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -src_path=$(dirname "$(realpath $0)")/.. - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DUSE_ROCSHMEM=OFF \ - -DUSE_HIP=ON \ - -DALL_ANALYZE=ON \ - -DUSE_DOUBLE=OFF \ - -DALL_LEVELSET=OFF \ - -DALL_LEVELSYNC=OFF \ - -DALL_SYNCFREE=OFF \ - $src_path -cmake --build . --parallel 8 diff --git a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_opencl b/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_opencl deleted file mode 100755 index 41db75f17f..0000000000 --- a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_opencl +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -src_path=$(dirname "$(realpath $0)")/.. - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DUSE_ROCSHMEM=OFF \ - -DUSE_HIP=OFF \ - -DALL_ANALYZE=ON \ - -DUSE_DOUBLE=OFF \ - -DALL_LEVELSET=OFF \ - -DALL_LEVELSYNC=OFF \ - -DALL_SYNCFREE=OFF \ - $src_path -cmake --build . 
--parallel 8 diff --git a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_rocshmem b/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_rocshmem deleted file mode 100755 index c542aec341..0000000000 --- a/projects/rocshmem/internal/clients/spts/build_configs/analyze_single_rocshmem +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -if [ -z $1 ] -then - install_path=~/rocshmem -else - install_path=$1 -fi - -src_path=$(dirname "$(realpath $0)")/.. - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_VERBOSE_MAKEFILE=OFF \ - -DUSE_ROCSHMEM=ON \ - -DUSE_HIP=ON \ - -DALL_ANALYZE=ON \ - -DUSE_DOUBLE=OFF \ - -DALL_LEVELSET=OFF \ - -DALL_LEVELSYNC=OFF \ - -DALL_SYNCFREE=OFF \ - -Drocshmem_DIR=$install_path/share/cmake/rocshmem \ - $src_path -cmake --build . --parallel 8 diff --git a/projects/rocshmem/internal/clients/spts/config.h.in b/projects/rocshmem/internal/clients/spts/config.h.in deleted file mode 100644 index a9d4d814a2..0000000000 --- a/projects/rocshmem/internal/clients/spts/config.h.in +++ /dev/null @@ -1,7 +0,0 @@ -#cmakedefine USE_ROCSHMEM -#cmakedefine USE_HIP -#cmakedefine ALL_ANALYZE -#cmakedefine USE_DOUBLE -#cmakedefine ALL_LEVELSET -#cmakedefine ALL_LEVELSYNC -#cmakedefine ALL_SYNCFREE diff --git a/projects/rocshmem/internal/clients/spts/driver.sh b/projects/rocshmem/internal/clients/spts/driver.sh deleted file mode 100755 index 418ebdd525..0000000000 --- a/projects/rocshmem/internal/clients/spts/driver.sh +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -#!/bin/bash - -echo Test Name $2 - -INPUTS=/mnt/mlebeane/spts_data - -case $2 in - *"single_thread") - mpirun -np 2 $1 -f $INPUTS/test_matrices/diagonal_large.mtx -a 2 -b 512 -p 64 -v -i 3 > $3/diagonal_large_bput.log - mpirun -np 2 $1 -f $INPUTS/test_matrices/not_quite_diagonal.mtx -a 2 -b 256 -p 64 -v -i 3 > $3/not_quite_diagonal_bput.log - ;; - *"multi_thread") - mpirun -np 2 $1 -f $INPUTS/test_matrices/diagonal_large.mtx -a 2 -b 512 -p 64 -v -i 3 > $3/diagonal_large_bput.log - mpirun -np 2 $1 -f $INPUTS/test_matrices/not_quite_diagonal.mtx -a 2 -b 256 -p 64 -v -i 3 > $3/not_quite_diagonal_bput.log - mpirun -np 2 $1 -f $INPUTS/test_matrices/not_quite_diagonal.mtx -a 1 -b 256 -v -i 3 > $3/not_quite_diagonal_get.log - ;; - *) - echo "UNKNOWN TEST TYPE: $2" - exit -1 - ;; -esac - -exit $? 
diff --git a/projects/rocshmem/internal/clients/spts/mmio.h b/projects/rocshmem/internal/clients/spts/mmio.h deleted file mode 100644 index b83946d231..0000000000 --- a/projects/rocshmem/internal/clients/spts/mmio.h +++ /dev/null @@ -1,86 +0,0 @@ -/* -* Matrix Market I/O library for ANSI C -* -* See http://math.nist.gov/MatrixMarket for details. -* -* -*/ - -#ifndef MM_IO_H -#define MM_IO_H - -/********************* MM_typecode query fucntions ***************************/ - -#define mm_is_matrix(typecode) ((typecode)[0]=='M') - -#define mm_is_sparse(typecode) ((typecode)[1]=='C') -#define mm_is_coordinate(typecode)((typecode)[1]=='C') -#define mm_is_dense(typecode) ((typecode)[1]=='A') -#define mm_is_array(typecode) ((typecode)[1]=='A') - -#define mm_is_complex(typecode) ((typecode)[2]=='C') -#define mm_is_real(typecode) ((typecode)[2]=='R') -#define mm_is_pattern(typecode) ((typecode)[2]=='P') -#define mm_is_integer(typecode) ((typecode)[2]=='I') - -#define mm_is_symmetric(typecode)((typecode)[3]=='S') -#define mm_is_general(typecode) ((typecode)[3]=='G') -#define mm_is_skew(typecode) ((typecode)[3]=='K') -#define mm_is_hermitian(typecode)((typecode)[3]=='H') - -/********************* MM_typecode modify fucntions ***************************/ - -#define mm_set_matrix(typecode) ((typecode)[0]='M') -#define mm_set_coordinate(typecode) ((typecode)[1]='C') -#define mm_set_array(typecode) ((typecode)[1]='A') -#define mm_set_dense(typecode) mm_set_array(typecode) -#define mm_set_sparse(typecode) mm_set_coordinate(typecode) - -#define mm_set_complex(typecode)((typecode)[2]='C') -#define mm_set_real(typecode) ((typecode)[2]='R') -#define mm_set_pattern(typecode)((typecode)[2]='P') -#define mm_set_integer(typecode)((typecode)[2]='I') - - -#define mm_set_symmetric(typecode)((typecode)[3]='S') -#define mm_set_general(typecode)((typecode)[3]='G') -#define mm_set_skew(typecode) ((typecode)[3]='K') -#define mm_set_hermitian(typecode)((typecode)[3]='H') - -#define 
mm_clear_typecode(typecode) ((typecode)[0]=(typecode)[1]= \ - (typecode)[2]=' ',(typecode)[3]='G') - -#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode) - - -/********************* Matrix Market error codes ***************************/ - - -#define MM_COULD_NOT_READ_FILE 11 -#define MM_PREMATURE_EOF 12 -#define MM_NOT_MTX 13 -#define MM_NO_HEADER 14 -#define MM_UNSUPPORTED_TYPE 15 -#define MM_LINE_TOO_LONG 16 -#define MM_COULD_NOT_WRITE_FILE 17 - -#define MM_MTX_STR "matrix" -#define MM_ARRAY_STR "array" -#define MM_DENSE_STR "array" -#define MM_COORDINATE_STR "coordinate" -#define MM_SPARSE_STR "coordinate" -#define MM_COMPLEX_STR "complex" -#define MM_REAL_STR "real" -#define MM_INT_STR "integer" -#define MM_GENERAL_STR "general" -#define MM_SYMM_STR "symmetric" -#define MM_HERM_STR "hermitian" -#define MM_SKEW_STR "skew-symmetric" -#define MM_PATTERN_STR "pattern" - -#define MM_MAX_LINE_LENGTH 1025 -#define MM_MAX_TOKEN_LENGTH 64 -#define MatrixMarketBanner "%%MatrixMarket" -#define MAX_RAND_VAL 5.0 - -#endif diff --git a/projects/rocshmem/internal/clients/spts/spts_kernel.h b/projects/rocshmem/internal/clients/spts/spts_kernel.h deleted file mode 100644 index 69a7c458bb..0000000000 --- a/projects/rocshmem/internal/clients/spts/spts_kernel.h +++ /dev/null @@ -1,2107 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - ********************************************************************************/ - -#include "GPUHelper.h" - -#include -#include -#include - -#ifdef USE_ROCSHMEM -#include "rocshmem.hpp" -using namespace rocshmem; -#endif - -#ifndef WF_PER_WG -#error "WF_PER_WG undefined!" -#endif - -#ifndef WF_SIZE -#error "WF_SIZE undefind!" -#endif - -#define as_uint (unsigned int) -#define as_ulong (unsigned long long) -#define as_float (float) - -#ifdef USE_DOUBLE -typedef double FPTYPE; -#else -typedef float FPTYPE; -#endif - -// GCN3 and below require slightly different inline asm than Vega -// v_add_u32 requires a "vcc" register output modifier on GCN3, but not on Vega -// global_load_ in Vega is required to be flat_load_ in GCN3 and below. -// Same for global_store_ and flat_store_. -// However, the global_ instructions require an "off" modifier. 
-#if defined(GCN3) || defined(GCN2) -#define VCC "vcc" -#define MEM_PREFIX "flat" -#define OFF_MODIFIER "" -#else -#define VCC "" -#define MEM_PREFIX "global" -#define OFF_MODIFIER "off" -#endif - -#ifndef GCN2 -#define LGKMCNT_0 0xc07f // GCN3 added more VMCNT bits at the upper end of the SIMM16 -#define WAKEUP "s_wakeup" -#else -#define LGKMCNT_0 0x7f -#define WAKEUP "" // s_wakeup not supported on old GPUs -#endif - -#define __builtin_amdgcn_ds_bpermute __hip_ds_bpermute -#define __builtin_amdgcn_ds_swizzle __hip_ds_swizzle -#define __builtin_amdgcn_mov_dpp __hip_move_dpp - -#define HIP_ENABLE_PRINTF - -// Internal functions to wrap atomics, depending on if we support 64-bit -// atomics or not. Helps keep the code clean in the other parts of the code. -// All of the 32-bit atomics are built assuming we're on a little endian architecture. -__device__ -inline unsigned long spts_atomic_cmpxchg(unsigned long long *const ptr, - const unsigned long long compare, - const unsigned long long val) -{ -#ifdef USE_DOUBLE - return atomicCAS(ptr, compare, val); -#else - return atomicCAS(ptr, compare, val); -#endif -} - -__device__ -void atomic_set (FPTYPE *ptr, FPTYPE temp) -{ -#ifdef USE_DOUBLE - unsigned long long newVal; - unsigned long long prevVal; - do - { - prevVal = as_ulong(*ptr); - newVal = as_ulong(temp); - } while (spts_atomic_cmpxchg((unsigned long long *)ptr, prevVal, newVal) != prevVal); - -#else - unsigned long long newVal; - unsigned long long prevVal; - do - { - prevVal = as_uint(*ptr); - newVal = as_uint(temp); - } while (spts_atomic_cmpxchg((unsigned long long *)ptr, prevVal, newVal) != prevVal); -#endif -} - -__device__ -inline void atomic_set_done(uint * done_array, uint row, uint val_to_set) -{ - atomicOr(&(done_array[row]), val_to_set); -} - -__device__ -inline unsigned int atomic_get_done(uint * done_array, uint val_to_check) -{ - return atomicOr(&(done_array[val_to_check]), 0x0); -} - -// Use a traditional LDS-based reduction to have all of the 
threads in the wave -// add their values into OUTPUT_THREAD's variable. -__device__ -FPTYPE lds_reduction(FPTYPE temp_sum, __shared__ FPTYPE *lds, - unsigned int start_of_this_row, unsigned int end_of_this_row, - unsigned int wg_lid) -{ - const unsigned int lid = wg_lid % WF_SIZE; - - // Have all the threads in a workgroup reduce their data into a single - // value that's then read by the lead thread - // We start by calculating how many layers of reduction we actually need. - // If this is a very short row (smaller than our wavefront size), then we don't need - // to do all iterations of the below loop. - unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE); - // find next highest power of two. So if we have 5 things to reduce, we need to - // do a reduction from 8 threads' values. The last 3 will be '0' - num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1)); - - for (int i = num_items >> 1; i > 0; i >>= 1) - { - lds[wg_lid] = temp_sum; - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); - - if (lid < i) - temp_sum += lds[wg_lid + i]; - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); - } - // at this point, thread 0's "temp_sum" contains the final useful value. - return temp_sum; -} - -// Use a traditional LDS-based reduction to have all of the threads in the wave -// add their values into OUTPUT_THREAD's variable. -// It hides the max work behind the same s_waitcnt on local memory, -// so it should be faster than calling the reduce function twice in a row. -__device__ -FPTYPE lds_reduction_two(FPTYPE temp_sum, unsigned int row_max_depth, - __shared__ FPTYPE *lds, __shared__ unsigned int *max_depth, - unsigned int start_of_this_row, unsigned int end_of_this_row, - unsigned int wg_lid) -{ - const unsigned int lid = wg_lid % WF_SIZE; - - // Have all the threads in a workgroup reduce their data into a single - // value that's then read by the lead thread - // We start by calculating how many layers of reduction we actually need. 
- // If this is a very short row (smaller than our wavefront size), then we don't need - // to do all iterations of the below loop. - unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE); - // find next highest power of two. So if we have 5 things to reduce, we need to - // do a reduction from 8 threads' values. The last 3 will be '0' - num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1)); - - for (int i = num_items >> 1; i > 0; i >>= 1) - { - lds[wg_lid] = temp_sum; - max_depth[wg_lid] = row_max_depth; - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); - if (lid < i) - { - temp_sum += lds[wg_lid + i]; - row_max_depth = max(row_max_depth, max_depth[wg_lid + i]); - } - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); - } - // at this point, max_depth[thread_0_within_each_wavefront] - // contains the useful maximum depth for this row. - max_depth[wg_lid] = row_max_depth; - // at this point, thread 0's "temp_sum" contains the final useful value. - return temp_sum; -} - -// Use a traditional LDS-based reduction to have all of the threads in the wave -// add their values into OUTPUT_THREAD's variable. -// It hides the max work behind the same s_waitcnt on local memory, -// so it should be faster than calling the reduce function three times in a row. -__device__ -FPTYPE lds_reduction_three(FPTYPE temp_sum, unsigned int row_max_depth, - unsigned int spin_times, __shared__ FPTYPE *lds, - __shared__ unsigned int *max_depth, __shared__ unsigned int *total_spins, - unsigned int start_of_this_row, unsigned int end_of_this_row, - unsigned int wg_lid) -{ - const unsigned int lid = wg_lid % WF_SIZE; - - // Have all the threads in a workgroup reduce their data into a single - // value that's then read by the lead thread - // We start by calculating how many layers of reduction we actually need. - // If this is a very short row (smaller than our wavefront size), then we don't need - // to do all iterations of the below loop. 
- unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE); - // find next highest power of two. So if we have 5 things to reduce, we need to - // do a reduction from 8 threads' values. The last 3 will be '0' - num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1)); - - for (int i = num_items >> 1; i > 0; i >>= 1) - { - lds[wg_lid] = temp_sum; - max_depth[wg_lid] = row_max_depth; - total_spins[wg_lid] = spin_times; - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); - if (lid < i) - { - temp_sum += lds[wg_lid + i]; - row_max_depth = max(row_max_depth, max_depth[wg_lid + i]); - spin_times += total_spins[wg_lid + i]; - } - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); - } - // at this point, max_depth[thread_0_within_each_wavefront] - // contains the useful maximum depth for this row. - max_depth[wg_lid] = row_max_depth; - // and total_spins[thread_0_within_each_wavefront] has its - // total number of spin-loops. - total_spins[wg_lid] = spin_times; - // at this point, thread 0's "temp_sum" contains the final useful value. - return temp_sum; -} - -// Do a reduction using bpermute instructions. -// This is strictly worse than Swizzle-based reduction, since it is slower and -// only works on the same hardware as the swizzle instructions. -__device__ -FPTYPE bpermute_reduction(FPTYPE temp_sum, unsigned int start_of_this_row, - unsigned int end_of_this_row, unsigned int wg_lid) -{ - const unsigned int lid = wg_lid % WF_SIZE; - - // Have all the threads in a workgroup reduce their data into a single - // value that's then read by the lead thread - // We start by calculating how many layers of reduction we actually need. - // If this is a very short row (smaller than our workgroup size), then we don't need - // to do all iterations of the below loop. - unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE); - // find next highest power of two. 
So if we have 5 things to reduce, we need to - // do a reduction from 8 threads' values. The last 3 will be '0' - num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1)); - -#ifdef USE_DOUBLE - typedef union dbl_b32 { - double val; - uint2 b32; - } dbl_b32_t; - dbl_b32_t t_temp_sum; - t_temp_sum.val = temp_sum; - for (int i = num_items >> 1; i > 0; i >>= 1) - { - int pull_from = (lid + i) << 2; - dbl_b32_t upper_sum; - upper_sum.b32.x = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.x); - upper_sum.b32.y = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.y); - t_temp_sum.val += upper_sum.val; - } - temp_sum = t_temp_sum.val; -#else // !USE_DOUBLE - for (int i = num_items >> 1; i > 0; i >>= 1) - { - uint pull_from = (lid + i) << 2; - temp_sum += as_float(__builtin_amdgcn_ds_bpermute(pull_from, as_uint(temp_sum))); - } -#endif // USE_DOUBLE - return temp_sum; -} - -// Do a reduction using bpermute instructions. -// This is strictly worse than Swizzle-based reduction, since it is slower and -// only works on the same hardware as the swizzle instructions. -// This version also does a max-reduce on the row_max_depth variable. -// It hides this bpermute instruction behind the same s_waitcnt on local memory, -// so it should be faster than calling the reduce function twice in a row. -__device__ -FPTYPE bpermute_reduction_two(FPTYPE temp_sum, unsigned int *row_max_depth, - unsigned int start_of_this_row, unsigned int end_of_this_row, - unsigned int wg_lid) -{ - const unsigned int lid = wg_lid % WF_SIZE; - unsigned int max_depth = *row_max_depth; - - // Have all the threads in a workgroup reduce their data into a single - // value that's then read by the lead thread - // We start by calculating how many layers of reduction we actually need. - // If this is a very short row (smaller than our workgroup size), then we don't need - // to do all iterations of the below loop. 
- unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE); - // find next highest power of two. So if we have 5 things to reduce, we need to - // do a reduction from 8 threads' values. The last 3 will be '0' - num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1)); - -#ifdef USE_DOUBLE - typedef union dbl_b32 { - double val; - int2 b32; - } dbl_b32_t; - dbl_b32_t t_temp_sum; - t_temp_sum.val = temp_sum; - for (int i = num_items >> 1; i > 0; i >>= 1) - { - int pull_from = (lid + i) << 2; - dbl_b32_t upper_sum; - upper_sum.b32.x = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.x); - upper_sum.b32.y = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.y); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_bpermute(pull_from, max_depth))); - t_temp_sum.val += upper_sum.val; - } - temp_sum = t_temp_sum.val; -#else // !USE_DOUBLE - for (int i = num_items >> 1; i > 0; i >>= 1) - { - int pull_from = (lid + i) << 2; - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_bpermute(pull_from, max_depth))); - temp_sum += as_float(__builtin_amdgcn_ds_bpermute(pull_from, as_uint(temp_sum))); - } -#endif // USE_DOUBLE - *row_max_depth = max_depth; - return temp_sum; -} - -// Do a reduction using bpermute instructions. -// This is strictly worse than Swizzle-based reduction, since it is slower and -// only works on the same hardware as the swizzle instructions. -// This version also does a max-reduce on the row_max_depth variable. -// This version also does a max-add on the spin-loops per thread variable. -// It hides this bpermute instruction behind the same s_waitcnt on local memory, -// so it should be faster than calling the reduce function thrice in a row. 
-__device__ -FPTYPE bpermute_reduction_three(FPTYPE temp_sum, unsigned int *row_max_depth, - unsigned int *spin_times, unsigned int start_of_this_row, - unsigned int end_of_this_row, unsigned int wg_lid) -{ - const unsigned int lid = wg_lid % WF_SIZE; - unsigned int max_depth = *row_max_depth; - unsigned int spin_time = *spin_times; - - // Have all the threads in a workgroup reduce their data into a single - // value that's then read by the lead thread - // We start by calculating how many layers of reduction we actually need. - // If this is a very short row (smaller than our workgroup size), then we don't need - // to do all iterations of the below loop. - unsigned int num_items = min(end_of_this_row - start_of_this_row - 1, (uint)WF_SIZE); - // find next highest power of two. So if we have 5 things to reduce, we need to - // do a reduction from 8 threads' values. The last 3 will be '0' - num_items = 1 << (CHAR_BIT*(sizeof(unsigned int))-__clz(num_items-1)); - -#ifdef USE_DOUBLE - typedef union dbl_b32 { - double val; - int2 b32; - } dbl_b32_t; - dbl_b32_t t_temp_sum; - t_temp_sum.val = temp_sum; - for (int i = num_items >> 1; i > 0; i >>= 1) - { - int pull_from = (lid + i) << 2; - dbl_b32_t upper_sum; - upper_sum.b32.x = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.x); - upper_sum.b32.y = __builtin_amdgcn_ds_bpermute(pull_from, t_temp_sum.b32.y); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_bpermute(pull_from, max_depth))); - spin_time += __builtin_amdgcn_ds_bpermute(pull_from, spin_time); - t_temp_sum.val += upper_sum.val; - } - temp_sum = t_temp_sum.val; -#else // !USE_DOUBLE - for (int i = num_items >> 1; i > 0; i >>= 1) - { - int pull_from = (lid + i) << 2; - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_bpermute(pull_from, max_depth))); - spin_time += __builtin_amdgcn_ds_bpermute(pull_from, spin_time); - temp_sum += as_float(__builtin_amdgcn_ds_bpermute(pull_from, as_uint(temp_sum))); - } -#endif // USE_DOUBLE - *row_max_depth 
= max_depth; - *spin_times = spin_time; - return temp_sum; -} - -// Swizzle-based reduction; this will work on Sea Islands -/* -FPTYPE swizzle_reduction(FPTYPE temp_sum) -{ -#ifdef USE_DOUBLE - typedef union dbl_b32 { - double val; - int2 b32; - } dbl_b32_t; - dbl_b32_t upper_sum, t_temp_sum; - - t_temp_sum.val = temp_sum; - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x80b1); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x80b1); - t_temp_sum.val += upper_sum.val; - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x804e); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x804e); - t_temp_sum.val += upper_sum.val; - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x101f); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x101f); - t_temp_sum.val += upper_sum.val; - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x201f); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x201f); - t_temp_sum.val += upper_sum.val; - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x401f); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x401f); - t_temp_sum.val += upper_sum.val; - upper_sum.b32.x = __builtin_amdgcn_readlane(t_temp_sum.b32.x, 32); - upper_sum.b32.y = __builtin_amdgcn_readlane(t_temp_sum.b32.y, 32); - t_temp_sum.val += upper_sum.val; - temp_sum = t_temp_sum.val; -#else // Swizzle-based for SPFP - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x80b1)); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x804e)); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x101f)); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x201f)); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x401f)); - temp_sum += as_float(__builtin_amdgcn_readlane(as_uint(temp_sum), 32)); -#endif // Single or double precision - 
- return temp_sum; -} - -// Swizzle-based reduction; this will work on Sea Islands -// This version will also put in a max-reduction for row_max_depth behind -// the s_waitcnt instructions, making it faster than two sequential -// reductions back-to-back. -__device__ -FPTYPE swizzle_reduction_two(FPTYPE temp_sum, unsigned int *row_max_depth) -{ -#ifdef USE_DOUBLE - typedef union dbl_b32 { - double val; - int2 b32; - } dbl_b32_t; - dbl_b32_t upper_sum, t_temp_sum; - - t_temp_sum.val = temp_sum; - unsigned int max_depth = *row_max_depth; - unsigned int upper_max_depth; - - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x80b1))); - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x80b1); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x80b1); - t_temp_sum.val += upper_sum.val; - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x804e))); - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x804e); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x804e); - t_temp_sum.val += upper_sum.val; - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x101f))); - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x101f); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x101f); - t_temp_sum.val += upper_sum.val; - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x201f))); - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x201f); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x201f); - t_temp_sum.val += upper_sum.val; - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x401f))); - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x401f); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x401f); - t_temp_sum.val += upper_sum.val; - max_depth = max(max_depth, 
as_uint(__builtin_amdgcn_readlane(max_depth, 32))); - upper_sum.b32.x = __builtin_amdgcn_readlane(t_temp_sum.b32.x, 32); - upper_sum.b32.y = __builtin_amdgcn_readlane(t_temp_sum.b32.y, 32); - t_temp_sum.val += upper_sum.val; - temp_sum = t_temp_sum.val; -#else // Swizzle-based for SPFP - unsigned int max_depth = *row_max_depth; - - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x80b1)); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x80b1))); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x804e)); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x804e))); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x101f)); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x101f))); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x201f)); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x201f))); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x401f)); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x401f))); - temp_sum += as_float(__builtin_amdgcn_readlane(as_uint(temp_sum), 32)); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_readlane(max_depth, 32))); -#endif // Single or double precision - -#ifndef SYNCFREE_KERNEL - *row_max_depth = max_depth; -#endif - return temp_sum; -} - -// Swizzle-based reduction; this will work on Sea Islands -// This version will also put in a max-reduction for row_max_depth -// add-reduction of the spin-loop counter behind the s_waitcnt -// instructions, making it faster than two sequential reductions -// back-to-back. 
-__device__ -FPTYPE swizzle_reduction_three(FPTYPE temp_sum, unsigned int *row_max_depth, unsigned int *spin_times) -{ - unsigned int max_depth; - unsigned int spins; - -#ifdef USE_DOUBLE - typedef union dbl_b32 { - double val; - int2 b32; - } dbl_b32_t; - dbl_b32_t upper_sum, t_temp_sum; - - t_temp_sum.val = temp_sum; - max_depth = *row_max_depth; - spins = *spin_times; - - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x80b1))); - spins += __builtin_amdgcn_ds_swizzle(spins, 0x80b1); - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x80b1); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x80b1); - t_temp_sum.val += upper_sum.val; - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x804e))); - spins += __builtin_amdgcn_ds_swizzle(spins, 0x804e); - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x804e); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x804e); - t_temp_sum.val += upper_sum.val; - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x101f))); - spins += __builtin_amdgcn_ds_swizzle(spins, 0x101f); - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x101f); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x101f); - t_temp_sum.val += upper_sum.val; - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x201f))); - spins += __builtin_amdgcn_ds_swizzle(spins, 0x201f); - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x201f); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x201f); - t_temp_sum.val += upper_sum.val; - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x401f))); - spins += __builtin_amdgcn_ds_swizzle(spins, 0x401f); - upper_sum.b32.x = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.x, 0x401f); - upper_sum.b32.y = __builtin_amdgcn_ds_swizzle(t_temp_sum.b32.y, 0x401f); - t_temp_sum.val += 
upper_sum.val; - max_depth = max(max_depth, as_uint(__builtin_amdgcn_readlane(max_depth, 32))); - spins += __builtin_amdgcn_readlane(spins, 32); - upper_sum.b32.x = __builtin_amdgcn_readlane(t_temp_sum.b32.x, 32); - upper_sum.b32.y = __builtin_amdgcn_readlane(t_temp_sum.b32.y, 32); - t_temp_sum.val += upper_sum.val; - temp_sum = t_temp_sum.val; - -#else // Swizzle-based for SPFP - max_depth = *row_max_depth; - spins = *spin_times; - - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x80b1)); - spins += __builtin_amdgcn_ds_swizzle(spins, 0x80b1); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x80b1))); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x804e)); - spins += __builtin_amdgcn_ds_swizzle(spins, 0x804e); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x804e))); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x101f)); - spins += __builtin_amdgcn_ds_swizzle(spins, 0x101f); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x101f))); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x201f)); - spins += __builtin_amdgcn_ds_swizzle(spins, 0x201f); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x201f))); - temp_sum += as_float(__builtin_amdgcn_ds_swizzle(as_uint(temp_sum), 0x401f)); - spins += __builtin_amdgcn_ds_swizzle(spins, 0x401f); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_ds_swizzle(max_depth, 0x401f))); - temp_sum += as_float(__builtin_amdgcn_readlane(as_uint(temp_sum), 32)); - spins += __builtin_amdgcn_readlane(spins, 32); - max_depth = max(max_depth, as_uint(__builtin_amdgcn_readlane(max_depth, 32))); -#endif // Single or double precision - - *row_max_depth = max_depth; - *spin_times = spins; - return temp_sum; -} -*/ - -// If we are in GCN3, then use DPP to further increase the performance of -// inter-lane reduction of the temp_sum variable. 
-__device__ -FPTYPE dpp_reduction(FPTYPE temp_sum) -{ - // If we write the EXEC mask before the DPP op, we need 5 stall cycles. - // So every one of these starts with an s_nop 4 - // We require an s_nop 1 at the end in case the compiler immediately uses - // the last output value. -#ifndef GCN2 -#ifdef USE_DOUBLE - - typedef struct b32_2 { - int x; - int y; - } b32_t; - - typedef union dbl_b32 { - double val; - b32_t b32; - } dbl_b32_t; - dbl_b32_t upper_sum, t_temp_sum; - t_temp_sum.val = temp_sum; - upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x111, 0xf, 0xf, 0); // row_shr:1 - upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x111, 0xf, 0xf, 0); - t_temp_sum.val += upper_sum.val; - upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x112, 0xf, 0xf, 0); // row_shr:2 - upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x112, 0xf, 0xf, 0); - t_temp_sum.val += upper_sum.val; - upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x114, 0xf, 0xe, 0); // row_shr:4 bank_mask:0xe - upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x114, 0xf, 0xe, 0); - t_temp_sum.val += upper_sum.val; - upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x118, 0xf, 0xc, 0); // row_shr:8 bank_mask:0xc - upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x118, 0xf, 0xc, 0); - t_temp_sum.val += upper_sum.val; - upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x142, 0xa, 0xf, 0); // row_bcast:15 row_mask:0xa - upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x142, 0xa, 0xf, 0); - t_temp_sum.val += upper_sum.val; - upper_sum.b32.x = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.x, 0x143, 0xc, 0xf, 0); // row_bcast:31 row_maxk:0xc - upper_sum.b32.y = __builtin_amdgcn_mov_dpp(t_temp_sum.b32.y, 0x143, 0xc, 0xf, 0); - t_temp_sum.val += upper_sum.val; - return t_temp_sum.val; -#else // USE_DOUBLE - __asm__ volatile ("s_nop 4\n" - "v_add_f32 %0 %0 %0 row_shr:1 bound_ctrl:0\n" - "s_nop 
1\n" - "v_add_f32 %0 %0 %0 row_shr:2 bound_ctrl:0\n" - "s_nop 1\n" - "v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n" - "s_nop 1\n" - "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n" - "s_nop 1\n" - "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n" - "s_nop 1\n" - "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n" - "s_nop 1" - : "=v"(temp_sum) - : "0"(temp_sum)); - return temp_sum; -#endif // Single vs. Double -#else // We're in GCN2, so we will never enter this function - return temp_sum; -#endif -} - -// This version of the DPP reduction function also does a max-reduce on the -// row_max_depth variable. It fits these DPP functions into one of the NOP -// slots required by the DPP instructions, so it should be fast. -__device__ -FPTYPE dpp_reduction_two(FPTYPE temp_sum, unsigned int *row_max_depth) -{ - // If we write the EXEC mask before the DPP op, we need 5 stall cycles. - // So every one of these starts with an s_nop 4 - // We require an s_nop 1 at the end in case the compiler immediately uses - // the last output value. 
- unsigned int temp_max; -#ifdef USE_DOUBLE - typedef struct b32_2 { - int x; - int y; - } b32_t; - - typedef union dbl_b32 { - double val; - b32_t b32; - } dbl_b32_t; - dbl_b32_t upper_sum, t_temp_sum; - temp_max = *row_max_depth; - t_temp_sum.val = temp_sum; - __asm__ volatile ("s_nop 4\n" - "v_mov_b32 %0 %4 row_shr:1 bound_ctrl:0\n" - "v_mov_b32 %1 %5 row_shr:1 bound_ctrl:0\n" - "v_max_u32 %2 %2 %2 row_shr:1 bound_ctrl:0\n" - "s_nop 0\n" - "v_add_f64 %3 %7 %8\n" - "v_mov_b32 %0 %4 row_shr:2 bound_ctrl:0\n" - "v_mov_b32 %1 %5 row_shr:2 bound_ctrl:0\n" - "v_max_u32 %2 %2 %2 row_shr:2 bound_ctrl:0\n" - "s_nop 0\n" - "v_add_f64 %3 %7 %8\n" - "v_mov_b32 %0 %4 row_shr:4 bank_mask:0xe\n" - "v_mov_b32 %1 %5 row_shr:4 bank_mask:0xe\n" - "v_max_u32 %2 %2 %2 row_shr:4 bank_mask:0xe\n" - "s_nop 0\n" - "v_add_f64 %3 %7 %8\n" - "v_mov_b32 %0 %4 row_shr:8 bank_mask:0xc\n" - "v_mov_b32 %1 %5 row_shr:8 bank_mask:0xc\n" - "v_max_u32 %2 %2 %2 row_shr:8 bank_mask:0xc\n" - "s_nop 0\n" - "v_add_f64 %3 %7 %8\n" - "v_mov_b32 %0 %4 row_bcast:15 bank_mask:0xa\n" - "v_mov_b32 %1 %5 row_bcast:15 bank_mask:0xa\n" - "v_max_u32 %2 %2 %2 row_bcast:15 bank_mask:0xa\n" - "s_nop 0\n" - "v_add_f64 %3 %7 %8\n" - "v_mov_b32 %0 %4 row_bcast:31 row_mask:0xc\n" - "v_mov_b32 %1 %5 row_bcast:31 bank_mask:0xc\n" - "v_max_u32 %2 %2 %2 row_bcast:31 bank_mask:0xc\n" - "s_nop 0\n" - "v_add_f64 %3 %7 %8\n" - : "={v2}"(upper_sum.b32.x), "={v3}"(upper_sum.b32.y), "=v"(temp_max), "=v"(t_temp_sum.val) - : "v"(t_temp_sum.b32.x), "v"(t_temp_sum.b32.y), "2"(temp_max), "3"(t_temp_sum.val), "{v[2:3]}"(upper_sum.val)); - *row_max_depth = temp_max; - return t_temp_sum.val; -#else - temp_max = *row_max_depth; - __asm__ volatile ("s_nop 4\n" - "v_add_f32 %0 %0 %0 row_shr:1 bound_ctrl:0\n" - "v_max_u32 %1 %1 %1 row_shr:1 bound_ctrl:0\n" - "s_nop 0\n" - "v_add_f32 %0 %0 %0 row_shr:2 bound_ctrl:0\n" - "v_max_u32 %1 %1 %1 row_shr:2 bound_ctrl:0\n" - "s_nop 0\n" - "v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n" - "v_max_u32 %1 
%1 %1 row_shr:4 bank_mask:0xe\n" - "s_nop 0\n" - "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n" - "v_max_u32 %1 %1 %1 row_shr:8 bank_mask:0xc\n" - "s_nop 0\n" - "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n" - "v_max_u32 %1 %1 %1 row_bcast:15 row_mask:0xa\n" - "s_nop 0\n" - "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n" - "v_max_u32 %1 %1 %1 row_bcast:31 row_mask:0xc\n" - "s_nop 1\n" - : "=v"(temp_sum), "=v"(temp_max) - : "0"(temp_sum), "1"(temp_max)); - *row_max_depth = temp_max; - return temp_sum; -#endif // Single vs. Double -} - -// This version of the DPP reduction function also does a max-reduce on the -// row_max_depth variable and max-add on the total spin variable. -// It fits these DPP functions into NOP slots required by the DPP -// instructions, so it should be fast. -__device__ -FPTYPE dpp_reduction_three(FPTYPE temp_sum, unsigned int *row_max_depth, unsigned int *spin_times) -{ - // If we write the EXEC mask before the DPP op, we need 5 stall cycles. - // So every one of these starts with an s_nop 4 - // We require an s_nop 1 at the end in case the compiler immediately uses - // the last output value. 
- unsigned int temp_max = *row_max_depth; - unsigned int temp_spin = *spin_times; -#ifdef USE_DOUBLE - typedef struct b32_2 { - int x; - int y; - } b32_t; - - typedef union dbl_b32 { - double val; - b32_t b32; - } dbl_b32_t; - dbl_b32_t upper_sum, t_temp_sum; - temp_max = *row_max_depth; - t_temp_sum.val = temp_sum; - __asm__ volatile ("s_nop 4\n" - "v_mov_b32 %0 %5 row_shr:1 bound_ctrl:0\n" - "v_mov_b32 %1 %6 row_shr:1 bound_ctrl:0\n" - "v_max_u32 %2 %2 %2 row_shr:1 bound_ctrl:0\n" - "v_add_u32 %3 " VCC " %3 %3 row_shr:1 bound_ctrl:0\n" - "v_add_f64 %4 %9 %10\n" - "v_mov_b32 %0 %5 row_shr:2 bound_ctrl:0\n" - "v_mov_b32 %1 %6 row_shr:2 bound_ctrl:0\n" - "v_max_u32 %2 %2 %2 row_shr:2 bound_ctrl:0\n" - "v_add_u32 %3 " VCC " %3 %3 row_shr:2 bound_ctrl:0\n" - "v_add_f64 %4 %9 %10\n" - "v_mov_b32 %0 %5 row_shr:4 bank_mask:0xe\n" - "v_mov_b32 %1 %6 row_shr:4 bank_mask:0xe\n" - "v_max_u32 %2 %2 %2 row_shr:4 bank_mask:0xe\n" - "v_add_u32 %3 " VCC " %3 %3 row_shr:4 bank_mask:0xe\n" - "v_add_f64 %4 %9 %10\n" - "v_mov_b32 %0 %5 row_shr:8 bank_mask:0xc\n" - "v_mov_b32 %1 %6 row_shr:8 bank_mask:0xc\n" - "v_max_u32 %2 %2 %2 row_shr:8 bank_mask:0xc\n" - "v_add_u32 %3 " VCC " %3 %3 row_shr:8 bank_mask:0xc\n" - "v_add_f64 %4 %9 %10\n" - "v_mov_b32 %0 %5 row_bcast:15 row_mask:0xa\n" - "v_mov_b32 %1 %6 row_bcast:15 row_mask:0xa\n" - "v_max_u32 %2 %2 %2 row_bcast:15 row_mask:0xa\n" - "v_add_u32 %3 " VCC " %3 %3 row_bcast:15 row_mask:0xa\n" - "v_add_f64 %4 %9 %10\n" - "v_mov_b32 %0 %5 row_bcast:31 row_mask:0xc\n" - "v_mov_b32 %1 %6 row_bcast:31 row_mask:0xc\n" - "v_max_u32 %2 %2 %2 row_bcast:31 row_mask:0xc\n" - "v_add_u32 %3 " VCC " %3 %3 row_bcast:31 row_mask:0xc\n" - "v_add_f64 %4 %9 %10\n" - "s_nop 0\n" - : "={v2}"(upper_sum.b32.x), "={v3}"(upper_sum.b32.y), "=v"(temp_max), "=v"(temp_spin), "=v"(t_temp_sum.val) - : "v"(t_temp_sum.b32.x), "v"(t_temp_sum.b32.y), "2"(temp_max), "3"(temp_spin), "4"(t_temp_sum.val), "{v[2:3]}"(upper_sum.val)); - *row_max_depth = temp_max; - *spin_times 
= temp_spin; - return t_temp_sum.val; -#else - __asm__ volatile ("s_nop 4\n" - "v_add_f32 %0 %0 %0 row_shr:1 bound_ctrl:0\n" - "v_max_u32 %1 %1 %1 row_shr:1 bound_ctrl:0\n" - "v_add_u32 %2 " VCC " %2 %2 row_shr:1 bound_ctrl:0\n" - "v_add_f32 %0 %0 %0 row_shr:2 bound_ctrl:0\n" - "v_max_u32 %1 %1 %1 row_shr:2 bound_ctrl:0\n" - "v_add_u32 %2 " VCC " %2 %2 row_shr:2 bound_ctrl:0\n" - "v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n" - "v_max_u32 %1 %1 %1 row_shr:4 bank_mask:0xe\n" - "v_add_u32 %2 " VCC " %2 %2 row_shr:4 bank_mask:0xe\n" - "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n" - "v_max_u32 %1 %1 %1 row_shr:8 bank_mask:0xc\n" - "v_add_u32 %2 " VCC " %2 %2 row_shr:8 bank_mask:0xc\n" - "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n" - "v_max_u32 %1 %1 %1 row_bcast:15 row_mask:0xa\n" - "v_add_u32 %2 " VCC " %2 %2 row_bcast:15\n" - "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n" - "v_max_u32 %1 %1 %1 row_bcast:31 row_mask:0xc\n" - "v_add_u32 %2 " VCC " %2 %2 row_bcast:31\n" - "s_nop 1" - : "=v"(temp_sum), "=v"(temp_max), "=v"(temp_spin) - : "0"(temp_sum), "1"(temp_max), "2"(temp_spin)); - *row_max_depth = temp_max; - *spin_times = temp_spin; - return temp_sum; -#endif // Single vs. 
Double -} - -// Possible reduction techniques: -//#define LDS_REDUCTION -//#define BPERMUTE_REDUCTION -//#define SWIZZLE_REDUCTION - -//#define DPP_REDUCTION - -#if defined(GCN2) && defined(DPP_REDUCTION) -#define SWIZZLE_REDUCTION -#undef DPP_REDUCTION -#endif - -#ifdef DPP_REDUCTION - #define OUTPUT_THREAD WF_SIZE-1 -#else - #define OUTPUT_THREAD 0 -#endif - -__device__ -inline FPTYPE cross_lane_reduction(FPTYPE temp_sum, __shared__ FPTYPE *lds_ptr, - unsigned int start_of_this_row, unsigned int end_of_this_row, - unsigned int wg_lid) -{ -#ifdef LDS_REDUCTION - FPTYPE temp_val = lds_reduction(temp_sum, lds_ptr, start_of_this_row, - end_of_this_row, wg_lid); - return temp_val; -#endif - -#ifdef BPERMUTE_REDUCTION - return bpermute_reduction(temp_sum, start_of_this_row, end_of_this_row, - wg_lid); -#endif - -#ifdef SWIZZLE_REDUCTION - return swizzle_reduction(temp_sum); -#endif - -#ifdef DPP_REDUCTION - return dpp_reduction(temp_sum); -#endif -} - -__device__ -inline FPTYPE cross_lane_reduction_two(FPTYPE temp_sum, unsigned int *row_max_depth, - __shared__ FPTYPE *lds_ptr, __shared__ unsigned int *max_depth_ptr, - unsigned int start_of_this_row, unsigned int end_of_this_row, - unsigned int wg_lid) -{ -#ifdef LDS_REDUCTION - FPTYPE temp_val = lds_reduction_two(temp_sum, *row_max_depth, lds_ptr, - max_depth_ptr, start_of_this_row, end_of_this_row, wg_lid); - *row_max_depth = max_depth_ptr[wg_lid & (~(WF_SIZE-1))]; - return temp_val; -#endif - -#ifdef BPERMUTE_REDUCTION - return bpermute_reduction_two(temp_sum, row_max_depth, start_of_this_row, - end_of_this_row, wg_lid); -#endif - -#ifdef SWIZZLE_REDUCTION - return swizzle_reduction_two(temp_sum, row_max_depth); -#endif - -#ifdef DPP_REDUCTION - return dpp_reduction_two(temp_sum, row_max_depth); -#endif -} - -__device__ -inline FPTYPE cross_lane_reduction_three(FPTYPE temp_sum, unsigned int *row_max_depth, - unsigned int *spin_times, __shared__ FPTYPE *lds_ptr, - __shared__ unsigned int *max_depth_ptr, __shared__ 
unsigned int *total_spins_ptr, - unsigned int start_of_this_row, unsigned int end_of_this_row, - unsigned int wg_lid) - -{ - -#ifdef LDS_REDUCTION - FPTYPE temp_val = lds_reduction_three(temp_sum, *row_max_depth, *spin_times, - lds_ptr, max_depth_ptr, total_spins_ptr, start_of_this_row, - end_of_this_row, wg_lid); - *row_max_depth = max_depth_ptr[wg_lid & (~(WF_SIZE-1))]; - *spin_times = total_spins_ptr[wg_lid & (~(WF_SIZE-1))]; - return temp_val; -#endif - -#ifdef BPERMUTE_REDUCTION - return bpermute_reduction_three(temp_sum, row_max_depth, spin_times, - start_of_this_row, end_of_this_row, wg_lid); -#endif - -#ifdef SWIZZLE_REDUCTION - return swizzle_reduction_three(temp_sum, row_max_depth, spin_times); -#endif - -#ifdef DPP_REDUCTION - return dpp_reduction_three(temp_sum, row_max_depth, spin_times); -#endif - - return temp_sum; -} - -// The option below will, in the analyze and syncfree kernels, attempt to -// spin-loop on flags in the LDS for rows that are being solved by wavefronts -// earlier in the same workgroup. This should relieve global memory pressure. -// We found that, with careful control of branching for this logic, this yields -// an average of 20% better performance than global spin-looping. -#define USE_LDS_SPINLOOP - -// The option below will, in the levelsync kernel, attempt to spin-loop on -// flags in the LDS for rows that are being solved for wavefronts earlier in -// the same workgroup. This is beneficial if levels have very few rows in them, -// as workgroups are likely to have multiple levels and thus require spinning. -// However, knowing what rows are in the LDS entry is more difficult for the -// levelsync kernel, because it depends entirely on the rowMap entries being -// used by these waves. As such, this loses performance when walking the row -// map outweights the spin-loop benefits. As of this writing, the levelsync -// LDS spin-loop is a net loser. -// Leaving this around for future studies. 
-// #define USE_LDS_SPINLOOP_LEVELSYNC - - -// Solves for 'y' in the equation 'A * y = alpha * x' -// In this kernel, we do not know what level each row is in. As such, we must -// dynamically figure this out. Each row has the potential to require data from -// a previous row. This happens when it has a non-zero in a column. -// i.e. having a non-zero value in column $foo means you must wait for row $foo -// to finish. -// -// The 'doneArray' has one entry per row. It starts out with each entry containing -// zeroes. When a row finishes and its output written, it knows its own level -// (which must be 1 more than the highest level of any row it relied on). As such, -// it puts that level into the doneArray. If you must wait on a previous row, you -// spinloop on that row's doneArray entry. Once it's non-zero, you know both that -// the data is ready, as well as what level that value came from (so you can -// calculate your own level). -// -// The doneArray can be used for future iterations, since the parllelism doesn't -// change between iterations. As such, we keep the doneArray around and call -// a different kernel that doesn't do the spin-loop waiting. To prep for that -// kernel, we also need to know how many rows are at each level. Thus, when a -// row finishes, it increments the numRowsAtLevel[] entry associated with its -// level. Also we set the maxDepth variable to the maximum of any level seen. 
-//__attribute__((reqd_work_group_size(WF_SIZE*WF_PER_WG, 1, 1))) -//__kernel void -__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1) -amd_spts_analyze_and_solve( - const size_t global_work_size, -#ifdef USE_ROCSHMEM - const int this_pe, - const int total_pes, - unsigned int * __restrict__ shadowDoneArray, - unsigned int * __restrict__ reqUpdateArray, - unsigned int * __restrict__ remoteInProgressArray, - unsigned int * __restrict__ oneBuf, - // 0: Naive puts - // 1: Naive gets - // 2: blocked puts - // 3: put/get hybrid - int rocshmem_algorithm, - int rocshmem_put_block_size, - int rocshmem_get_backoff_factor, - int spts_block_size, -#endif - const FPTYPE * __restrict__ vals, - const int * __restrict__ cols, - const int * __restrict__ rowPtrs, - const FPTYPE * __restrict__ vec_x, - FPTYPE * __restrict__ out_y, - const FPTYPE alpha, - unsigned int * __restrict__ doneArray, - unsigned int * __restrict__ numRowsAtLevel, - unsigned int * __restrict__ maxDepth, - unsigned long long * __restrict__ totalSpin) -{ - __shared__ FPTYPE *lds_ptr; - lds_ptr = nullptr; - __shared__ unsigned int *max_depth_ptr; - max_depth_ptr = nullptr; - __shared__ unsigned int *total_spins_ptr; - total_spins_ptr = nullptr; -#ifdef LDS_REDUCTION - __shared__ FPTYPE lds[WF_SIZE*WF_PER_WG]; - lds_ptr = lds; -#endif - - // If we want future kernel iterations to skip the "wait on previous rows" - // work, we need to know what level set this row is in. This array is used - // to calculate the depth of each dependency so we can calculate max+1. 
-#ifdef LDS_REDUCTION - __shared__ unsigned int max_depth[WF_SIZE*WF_PER_WG]; - max_depth_ptr = max_depth; - __shared__ unsigned int total_spins[WF_SIZE*WF_PER_WG]; - total_spins_ptr = total_spins; -#endif // LDS_REDUCTION - unsigned int row_max_depth = 0; - unsigned int spin_times = 0; - const unsigned int wg_lid = hipThreadIdx_x; - const unsigned int lid = wg_lid % WF_SIZE; - -#ifdef USE_ROCSHMEM - __shared__ rocshmem_ctx_t ctx; - - - //if (wg_lid == OUTPUT_THREAD) { - rocshmem_wg_init(); - rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx); - __syncthreads(); -#endif - - // Which wavefront within this workgroup - // also means which row within this workgroup's group of rows - const unsigned int local_offset = wg_lid / WF_SIZE; - // First row within this workgroup (within this group of rows) - const unsigned int local_first_row = hipBlockIdx_x * WF_PER_WG; - // Actual row this wavefront will work on. - const unsigned int local_row = local_first_row + local_offset; - -#ifdef USE_ROCSHMEM - // Get the global row for this wavefront assuming a row-cyclic - // decomposition. Basically we need to account for other PEs here. - int local_block_id = local_row / spts_block_size; - const unsigned int block_offset = (local_block_id * spts_block_size * total_pes) + - (this_pe * spts_block_size); - const unsigned int row = block_offset + (local_row % spts_block_size); - const unsigned int first_row = block_offset + (local_first_row % spts_block_size); -#else - const unsigned int row = local_row; - const unsigned int first_row = local_first_row; -#endif - - __shared__ FPTYPE diagonal[WF_PER_WG]; - -#ifdef USE_LDS_SPINLOOP - // If we are trying to access an output that was produced by a wavefront - // earlier in this workgroup, perform the transfer and spin-loop in LDS - // to relieve global memory pressure. 
- __shared__ unsigned int localDoneArray[WF_PER_WG]; - __shared__ FPTYPE localOutY[WF_PER_WG]; - __syncthreads(); - - if (global_work_size > (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)) { - - if (lid == 0) - { - localDoneArray[local_offset] = 0; - localOutY[local_offset] = 0.; - } -#else - if (global_work_size > (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)) { -#endif - - FPTYPE temp_sum = 0.; - // Preload the first thread with alpha * x. We can bring this forward - // because the 'x' vector in A*y=alpha*x is fixed and known already. - // From this point on, we will subtract out values from rows of X from - // alpha*x, and that will allow us to solve for entries of y. - // Hauling this up to the top of the kernel increases performance because - // it removes the memory load and multiply from the critical path of - // "previous rows' inputs are ready, finish this and allow further rows - // to start up as fast as possible." - if (lid == OUTPUT_THREAD) - temp_sum = alpha * vec_x[row]; - - unsigned int start_of_this_row = rowPtrs[row]; - unsigned int end_of_this_row = rowPtrs[row+1]; - unsigned int start_point = start_of_this_row+lid; - - - // This wavefront operates on a single row, from its beginning to end. - for(unsigned int j = start_point; j < end_of_this_row; j+=WF_SIZE) - { - - FPTYPE out_val; - unsigned int local_done = 0; - // Replace the two loads below with inline assembly that sets the - // SLC bit. This forces the loads to essentially bypass the L2 - // to increase cache hit rate on other instructions. Vals and cols - // are basically streamed in, so caching them doesn't help much. - - // local_col will tell us, for this iteration of the above for loop - // (i.e. for this entry in this row), which columns contain the - // non-zero values. We must then ensure that the output from the row - // associated with the local_col is complete to ensure that we can - // calculate the right answer. 
- int local_col = __builtin_nontemporal_load(&cols[j]); - // Haul loading from vals[] up near the load of cols[] so that we get - // good coalsced loads. - FPTYPE local_val = __builtin_nontemporal_load(&vals[j]); - - // diagonal. Skip this, we need to solve for it. - if (local_col == row) - { - local_done = 1; - diagonal[local_offset] = local_val; - local_val = 0.; // Make the out_val multiply below do nothing. - } - - // While there are threads in this workgroup that have been unable to - // get their input, loop and wait for the flag to exist. - __asm__ volatile ("s_setprio 0"); -#ifdef USE_ROCSHMEM - int target_pe = (local_col / spts_block_size) % total_pes; - int backoff_counter = 0; - bool need_remote_notify = true; - bool need_comm = true; - bool first_time = true; - -#endif - -#ifdef USE_LDS_SPINLOOP - if (local_col >= first_row) - { - while (!local_done) - { - // Check in the LDS if the value was produced by someone - // within this workgroup. - local_done = localDoneArray[local_col - first_row]; - out_val = localOutY[local_col - first_row]; - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); - } - } -#endif // USE_LDS_SPINLOOP - while (!local_done) - { - // Replace this atomic with an assembly load with GLC bit set. - // This forces the load to go to the coherence point, allowing - // us to avoid deadlocks. 
- // local_done = atomic_get_done(doneArray, local_col); - __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n" - "s_waitcnt vmcnt(0)" - : "=v"(local_done) - : "v"(&doneArray[local_col])); - - spin_times++; - -#ifdef USE_ROCSHMEM - if ((total_pes > 1) && (target_pe != this_pe) && (rocshmem_algorithm == 1)) { - if (first_time) { - if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) != 0) - need_comm = false; - } - first_time = false; - if (need_comm) - { - for (int i = 0; i < (backoff_counter * rocshmem_get_backoff_factor); i++) - __asm__ volatile("s_sleep 127"); - - - rocshmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe); - //rocshmem_ctx_quiet(ctx); - - __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n" - "s_waitcnt vmcnt(0)" - : "=v"(local_done) - : "v"(&shadowDoneArray[local_col])); - - - if (local_done) - { - rocshmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe); - - __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP - : - : "v"(&doneArray[local_col]), - "v"(local_done)); - } else { - backoff_counter++; - - } - - } - } - - if ((total_pes > 1) && (target_pe != this_pe) && (rocshmem_algorithm == 3)) { - if (need_remote_notify) { - need_remote_notify = false; - //if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) != 0) - //if (atomicCAS(&remoteInProgressArray[local_col], 0, 1) == 0) - { - rocshmem_ctx_putmem_nbi(ctx, &reqUpdateArray[local_col], oneBuf, sizeof(int), target_pe); - //printf("Put 111 blockIDx %d threadID %d target_pe %d local_col %d oneBuf[0]= %d \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col, oneBuf[0]); - - rocshmem_ctx_fence(ctx); - //printf("fence 222 blockIDx %d threadID %d target_pe %d local_col %d \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col); - rocshmem_ctx_getmem_nbi(ctx, &shadowDoneArray[local_col], &doneArray[local_col], sizeof(int), target_pe); - 
rocshmem_ctx_quiet(ctx); - //printf("Get 333 blockIDx %d threadID %d target_pe %d local_col %d shadowDone %d \n \n", hipBlockIdx_x, hipThreadIdx_x, target_pe, local_col, shadowDoneArray[local_col]); - - __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\n" - "s_waitcnt vmcnt(0)" - : "=v"(local_done) - : "v"(&shadowDoneArray[local_col])); - - if (local_done) - { - rocshmem_ctx_getmem_nbi(ctx, &out_y[local_col], &out_y[local_col], sizeof(FPTYPE), target_pe); - rocshmem_ctx_quiet(ctx); - __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP - : - : "v"(&doneArray[local_col]), - "v"(local_done)); - } - } - } - } -#endif - } - - __asm__ volatile ("s_setprio 1"); -#ifdef USE_LDS_SPINLOOP - if (local_col < first_row) -#endif - { - // The command below is manually replaced with GCN assembly with - // the GLC bit set. This bypasses the L1, allowing us to do a - // coherent load of the variable without needing atomics. -#ifdef USE_DOUBLE - // out_val = as_double(atom_or((__global ulong *)&(out_y[local_col]), 0)); - __asm__ volatile (MEM_PREFIX"_load_dwordx2 %0 %1 " OFF_MODIFIER " glc\n" - "s_waitcnt vmcnt(0)" - : "=v"(out_val) - : "v"(&out_y[local_col])); -#else - // out_val = as_float(atomic_or((__global uint *)&(out_y[local_col]), 0)); - __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc\n" - "s_waitcnt vmcnt(0)" - : "=v"(out_val) - : "v"(&out_y[local_col])); -#endif - } - temp_sum -= local_val * out_val; - - row_max_depth = max(local_done, row_max_depth); - } - __asm__ volatile ("s_setprio 1"); - - // And if we care about the maximum depth, add it into OUTPUT_THREAD's - // entry within the max_depth array. 
- temp_sum = cross_lane_reduction_three(temp_sum, &row_max_depth, &spin_times, - lds_ptr, max_depth_ptr, total_spins_ptr, start_of_this_row, - end_of_this_row, wg_lid); - row_max_depth++; - - // y = (x-sum_of_vals_from_A) / diag - if (lid == OUTPUT_THREAD) - { -#ifndef LDS_REDUCTION - // Wait for local memory to quiesce for the diagonal - // LDS_REDUCTION has such waits in it already. - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); -#endif - FPTYPE out_val = temp_sum / diagonal[local_offset]; - //out_y[row] = out_val; - -#ifdef USE_DOUBLE - __asm__ volatile (MEM_PREFIX"_store_dwordx2 %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val)); -#else - __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val)); -#endif - - //out_y[row] = temp_sum / diagonal[local_offset]; // original divide -#ifdef USE_LDS_SPINLOOP - localOutY[row - first_row] = out_val; - localDoneArray[row - first_row] = row_max_depth; -#endif // USE_LDS_SPINLOOP - //doneArray[row] = row_max_depth; - __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\n" WAKEUP : : "v"(&doneArray[row]), "v"(row_max_depth)); - asm volatile ("s_waitcnt vmcnt(0)\n\t"); - -#ifdef USE_ROCSHMEM - if (rocshmem_algorithm == 2 && total_pes > 1) { - int CHUNK = rocshmem_put_block_size; - bool sendTime = true; - int row_base = (row / CHUNK) * CHUNK; - int num_done = atomicAdd(&shadowDoneArray[row_base], 1); - sendTime = (num_done == (CHUNK - 1)); - for(int p=0; p= first_row) - { - // Check in the LDS if the value was produced by someone - // within this workgroup. - local_done = localDoneArray[local_col - first_row]; - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); - } - else -#endif // USE_LDS_SPINLOOP - { - // Replace this atomic with an assembly load with GLC bit set. - // This forces the load to go to the coherence point, allowing - // us to avoid deadlocks. 
- // local_done = atomic_get_done(doneArray, local_col); - __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\ns_waitcnt vmcnt(0)" : "=v"(local_done) : "v"(&doneArray[local_col])); - } - if (local_done) - { - FPTYPE out_val; - __asm__ volatile ("s_setprio 1"); -#ifdef USE_LDS_SPINLOOP - if (local_col >= first_row) - { - out_val = localOutY[local_col - first_row]; - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); - } - else -#endif // USE_LDS_SPINLOOP - { - // The command below is manually replaced with GCN assembly with - // the GLC bit set. This bypasses the L1, allowing us to do a - // coherent load of the variable without needing atomics. -#ifdef USE_DOUBLE - // out_val = as_double(atom_or((__global ulong *)&(out_y[local_col]), 0)); - __asm__ volatile (MEM_PREFIX"_load_dwordx2 %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : "=v"(out_val) : "v"(&out_y[local_col])); -#else - // out_val = as_float(atomic_or((__global uint *)&(out_y[local_col]), 0)); - __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : "=v"(out_val) : "v"(&out_y[local_col])); -#endif - } - temp_sum -= local_val * out_val; - - } - else - { - (void)0; - } - } - } - __asm__ volatile ("s_setprio 1"); - - // Take all of the temp_sum values and add them together into - // OUTPUT_THREAD's temp_sum value. - temp_sum = cross_lane_reduction(temp_sum, lds_ptr, start_of_this_row, - end_of_this_row, wg_lid); - - // y = (x-sum_of_vals_from_A) / diag - if (lid == OUTPUT_THREAD) - { -#ifndef LDS_REDUCTION - // Wait for local memory to quiesce for the diagonal - // LDS_REDUCTION has such waits in it already. 
- asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); -#endif - FPTYPE out_val = temp_sum / diagonal[local_offset]; - //out_y[row] = out_val; -#ifdef USE_DOUBLE - __asm__ volatile (MEM_PREFIX"_store_dwordx2 %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val)); -#else - __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val)); -#endif - //out_y[row] = temp_sum / diagonal[local_offset]; // original divide - int set_one = 1; -#ifdef USE_LDS_SPINLOOP - localDoneArray[row - first_row] = 1; - localOutY[row - first_row] = out_val; -#endif // USE_LDS_SPINLOOP - //doneArray[row] = 1; - __asm__ volatile (MEM_PREFIX"_store_byte %0 %1 " OFF_MODIFIER " glc\n" WAKEUP : : "v"(&doneArray[row]), "v"(set_one)); - // If you add this back in after doing a native_divide up above, - // we can get *some* of the accuracy of a full Newton-Raphson - // divide while maintaining the performance of the - // native_divide() on the critical path. - //out_y[row] = temp_sum / diagonal[local_offset]; - } -} - -// Solves for 'y' in the equation 'A * y = alpha * x' -// In this kernel, every row is in the same level. As such, we can freely -// have every workgrup complete at its own pace. -// However, we must call this kernel multiple times, once per level. -// -// The rowMap tells us that, in this level, gid X works on row Y. -// We need this because each level of the solve can have different numbers -// of non-contiguous row. This version of our solver uses one kernel call -// per level. -// -// In addition, the 'total_rows_in_prev_levels' tells us how far in that array -// to look. 
-__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1) -amd_spts_levelset_solve( - size_t global_work_size, - const FPTYPE * __restrict__ vals, - const int * __restrict__ cols, - const int * __restrict__ rowPtrs, - const FPTYPE * __restrict__ vec_x, - FPTYPE * __restrict__ out_y, - const unsigned int * __restrict__ rowMap, - const unsigned int total_rows_in_prev_levels, - const FPTYPE alpha) -{ - if (global_work_size <= hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) return; - __shared__ FPTYPE *lds_ptr; - lds_ptr = nullptr; -#ifdef LDS_REDUCTION - __shared__ FPTYPE lds[WF_SIZE*WF_PER_WG]; - lds_ptr = lds; -#endif - - // Which wavefront within this workgroup - // also means which row within this workgroup's group of rows - const unsigned int local_offset = hipThreadIdx_x / WF_SIZE; - // First row within this workgroup (within this group of rows) - const unsigned int first_row = hipBlockIdx_x * WF_PER_WG; - - const unsigned int wg_lid = hipThreadIdx_x; - const unsigned int lid = wg_lid % WF_SIZE; - - const unsigned int row = rowMap[total_rows_in_prev_levels+first_row+local_offset]; - - __shared__ FPTYPE diagonal[WF_PER_WG]; - FPTYPE temp_sum = 0.; - - // Preload the first thread with alpha * x. We can bring this forward - // because the 'x' vector in A*y=alpha*x is fixed and known already. - // From this point on, we will subtract out values from rows of X from - // alpha*x, and that will allow us to solve for entries of y. - if (lid == OUTPUT_THREAD) - temp_sum = alpha * vec_x[row]; - - unsigned int start_of_this_row = rowPtrs[row]; - unsigned int end_of_this_row = rowPtrs[row+1]; - unsigned int start_point = start_of_this_row+lid; - - // This workgroup operates on a single row, from its beginning to end. - for(unsigned int j = start_point; j < end_of_this_row; j+=WF_SIZE) - { - // local_col will tell us, for this iteration of the above for loop - // (i.e. for this entry in this row), which columns contain the - // non-zero values. 
We must then ensure that the output from the row - // associated with the local_col is complete to ensure that we can - // calculate the right answer. - int local_col = -1; - // Haul loading from vals[] up near the load of cols[] so that we get - // good coalsced loads. - FPTYPE local_val = 0.; - - // Replace the two loads below with inline assembly that sets the - // SLC bit. This forces the loads to essentially bypass the L2 - // to increase cache hit rate on other instructions. Vals and cols - // are basically streamed in, so caching them doesn't help much. - // local_col = cols[j]; - // local_val = vals[j]; -#ifdef USE_DOUBLE - __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dwordx2 %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j])); -#else - __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dword %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j])); -#endif - - // diagonal. Skip this, we need to solve for it. - if (local_col == row) - diagonal[local_offset] = local_val; - else - { - FPTYPE out_val = out_y[local_col]; - temp_sum -= local_val * out_val; - } - } - // Take all of the temp_sum values and add them together into - // OUTPUT_THREAD's temp_sum value. - temp_sum = cross_lane_reduction(temp_sum, lds_ptr, - start_of_this_row, end_of_this_row, wg_lid); - - // y = (x-sum_of_vals_from_A) / diag - if (lid == OUTPUT_THREAD) - { -#ifndef LDS_REDUCTION - // Wait for local memory to quiesce for the diagonal - // LDS_REDUCTION has such waits in it already. 
- asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); -#endif - out_y[row] = temp_sum / diagonal[local_offset]; // original divide - //out_y[row] = temp_sum / diagonal[local_offset]; // original divide - } -} - -// Solves for 'y' in the equation 'A * y = alpha * x' -// This kernel will only work if we launch a single workgroup that will -// solve multiple levels in a serial fashion. For each level, every thread -// within that level will try to solve for a different row. -// After solving for this level, the single workgroup hits a workgroup-wide -// barrier instruction waiting for all the other rows in this level to -// complete. -// -// We can only solve up to 1024 rows in a single level call right now, -// because each thread will solve a single row per level. -// -// This is a "CSR-Scalar" style analysis, where each thread is accessing -// a potentially very different area of both the CSR matrix and the vector. -// Performance may be bad, but this is very easy to write. -// -// The rowMap tells us that, within a level, thread X works on row Y. -// We need this because each level of the solve can have different numbers -// of non-contiguous row. -// In addition, the 'total_rows_in_prev_levels' tells us how far in that array -// to look. -// -// [start_level, end_level) tell us which entries in the rowMap we will go -// through in this kernel invocation. 
-__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1) -amd_spts_scalar_solve( - size_t global_work_size, - const FPTYPE * __restrict__ vals, - const int * __restrict__ cols, - const int * __restrict__ rowPtrs, - const FPTYPE * __restrict__ vec_x, - FPTYPE * __restrict__ out_y, - const FPTYPE alpha, - const unsigned int * __restrict__ rowMap, - const unsigned int * __restrict__ totalRowsInEachLevel, - const unsigned int total_rows_in_prev_levels, - const unsigned int start_level, - const unsigned int end_level) -{ - if (global_work_size <= hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) return; - const unsigned int gid = hipBlockIdx_x; - const unsigned int wg_lid = hipThreadIdx_x; - const unsigned int lid = wg_lid % WF_SIZE; - - __shared__ unsigned int total_rows_seen_so_far; - if (wg_lid == 0) - total_rows_seen_so_far = 0; - - // We have a single workgroup, and it is going to walk through a - // contiguous set of "levels" in the dependency graph. - for (unsigned int current_level = start_level; current_level < end_level; current_level++) - { - // Every time we reach a new level, all of the threads within - // this workgroup need to have completed their row's work. - // This guarantees that we have synchronized. - __syncthreads(); - if (wg_lid < totalRowsInEachLevel[current_level]) - { - const unsigned int entry_in_row_map = total_rows_in_prev_levels + total_rows_seen_so_far + wg_lid; - const unsigned int row = rowMap[entry_in_row_map]; - FPTYPE diagonal = 0.; - FPTYPE temp_sum = alpha * vec_x[row]; - - unsigned int start_of_this_row = rowPtrs[row]; - unsigned int end_of_this_row = rowPtrs[row+1]; - - // This thread operates on a single row, from its beginning to end. - for(unsigned int j = start_of_this_row; j < end_of_this_row; j++) - { - // local_col will tell us, for this iteration of the above for loop - // (i.e. for this entry in this row), which columns contain the - // non-zero values. 
We must then ensure that the output from the row - // associated with the local_col is complete to ensure that we can - // calculate the right answer. - int local_col = cols[j]; - // Haul loading from vals[] up near the load of cols[] so that we get - // good coalsced loads. - FPTYPE local_val = vals[j]; - - // diagonal. Skip this, we need to solve for it. - if (local_col == row) - diagonal = local_val; - else - { - FPTYPE out_val; -#ifdef USE_DOUBLE - out_val = __ull2double_rd(atomicOr((unsigned long long *)&(out_y[local_col]), 0)); -#else - out_val = as_float(atomicOr((uint *)&(out_y[local_col]), 0)); -#endif - temp_sum -= local_val * out_val; - } - } - - FPTYPE out_val = temp_sum / diagonal; - //FPTYPE out_val = temp_sum / diagonal; // original divide - out_y[row] = out_val; - } - if (wg_lid == 0) - total_rows_seen_so_far += totalRowsInEachLevel[current_level]; - } -} - -// Solves for 'y' in the equation 'A * y = alpha * x' -// This kernel will only work if we launch a single workgroup that will -// solve multiple levels in a serial fashion. For each level, every wavefront -// within that level will try to solve for a different row. -// After solving for this level, the single workgroup hits a workgroup-wide -// barrier instruction waiting for all the other rows in this level to -// complete. -// -// Within a level, this algorithm will loop through the rows, so we should -// be able to handle levels of any size -- no synchronization is needed -// between the wavefronts working on a single level, since those rows are -// independent of one another. -// -// This is a "CSR-Vector" style execution, where each wavefront accesses -// coalesced values within its row, but where short rows waste thread -// resources. -// -// The rowMap tells us that, within a level, thread X works on row Y. -// We need this because each level of the solve can have different numbers -// of non-contiguous row. 
-// In addition, the 'total_rows_in_prev_levels' tells us how far in that array -// to look. -// -// [start_level, end_level) tell us which entries in the rowMap we will go -// through in this kernel invocation. -__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1) -amd_spts_vector_solve( - size_t global_work_size, - const FPTYPE * __restrict__ vals, - const int * __restrict__ cols, - const int * __restrict__ rowPtrs, - const FPTYPE * __restrict__ vec_x, - FPTYPE * out_y, - const FPTYPE alpha, - const unsigned int * __restrict__ rowMap, - const unsigned int * __restrict__ totalRowsInEachLevel, - const unsigned int total_rows_in_prev_levels, - const unsigned int start_level, - const unsigned int end_level) -{ - if (global_work_size <= hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) return; - __shared__ FPTYPE *lds_ptr; - lds_ptr = nullptr; -#ifdef LDS_REDUCTION - __shared__ FPTYPE lds[WF_SIZE*WF_PER_WG]; - lds_ptr = lds; -#endif - __shared__ FPTYPE diagonal[WF_PER_WG]; - - // First row within this workgroup (within this group of rows) - const unsigned int first_row = hipBlockIdx_x * WF_PER_WG; - - const unsigned int gid = hipBlockIdx_x; - const unsigned int wg_lid = hipThreadIdx_x; - const unsigned int lid = wg_lid % WF_SIZE; - const unsigned int wf_id = wg_lid / WF_SIZE; - - unsigned int cur_loc_row = wf_id; - - unsigned int total_rows_seen_so_far = 0; - - // We have a single workgroup, and it is going to walk through a - // contiguous set of "levels" in the dependency graph. - for (unsigned int current_level = start_level; current_level < end_level; current_level++) - { - // Every time we reach a new level, all of the wavefronts within - // this workgroup need to have completed their row's work. - // This guarantees that we have synchronized. 
- __syncthreads(); - for (unsigned int cur_loc_row = wf_id; cur_loc_row < totalRowsInEachLevel[current_level]; cur_loc_row += WF_PER_WG) - { - const unsigned int entry_in_row_map = total_rows_in_prev_levels + total_rows_seen_so_far + cur_loc_row; - const unsigned int row = rowMap[entry_in_row_map]; - FPTYPE temp_sum = 0.; - - if (lid == OUTPUT_THREAD) - temp_sum = alpha * vec_x[row]; - - unsigned int start_of_this_row = rowPtrs[row]; - unsigned int end_of_this_row = rowPtrs[row+1]; - - // This thread operates on a single row, from its beginning to end. - for(unsigned int j = start_of_this_row + lid; j < end_of_this_row; j += WF_SIZE) - { - // local_col will tell us, for this iteration of the above for loop - // (i.e. for this entry in this row), which columns contain the - // non-zero values. We must then ensure that the output from the row - // associated with the local_col is complete to ensure that we can - // calculate the right answer. - int local_col = -1; - // Haul loading from vals[] up near the load of cols[] so that we get - // good coalsced loads. - FPTYPE local_val = 0.; - - // Replace the two loads below with inline assembly that sets the - // SLC bit. This forces the loads to essentially bypass the L2 - // to increase cache hit rate on other instructions. Vals and cols - // are basically streamed in, so caching them doesn't help much. - //local_col = cols[j]; - //local_val = vals[j]; -#ifdef USE_DOUBLE - __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dwordx2 %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j])); -#else - __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dword %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j])); -#endif - - // diagonal. Skip this, we need to solve for it. 
- if (local_col == row) - diagonal[wf_id] = local_val; - else - { - FPTYPE out_val; - out_val = out_y[local_col]; - temp_sum -= local_val * out_val; - } - } - - // Take all of the temp_sum values and add them together into - // OUTPUT_THREAD's temp_sum value. - temp_sum = cross_lane_reduction(temp_sum, lds_ptr, - start_of_this_row, end_of_this_row, wg_lid); - - // y = (x-sum_of_vals_from_A) / diag - if (lid == OUTPUT_THREAD) - { -#ifndef LDS_REDUCTION - // Wait for local memory to quiesce for the diagonal - // LDS_REDUCTION has such waits in it already. - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); -#endif - FPTYPE out_val = temp_sum / diagonal[wf_id]; - //FPTYPE out_val = temp_sum / diagonal[wf_id]; // original divide - out_y[row] = out_val; - } - } - total_rows_seen_so_far += totalRowsInEachLevel[current_level]; - } -} - -// Solves for 'y' in the equation 'A * y = alpha * x' -// This kernel is a simplified modification of the synchronization-free kernel. -// However, it is set up to work on rows that are in a contiguous series of -// levels. As such, this must be run after the initial analysis phase has -// produced a row map. -// -// Within a level, this kernel can use multiple workgroups to work on many -// rows simultaneously. In addition, multiple levels can be in flight at once, -// and this algorithm will use the synchronization-free spin-looping to produce -// the correct answer. -// -// However, we may not want to use *just* the synchronization-free spin-looping -// approach on all rows at the same time, as many rows deep in the dependency -// graph may just end up waiting, and spinning, for a long time. This spinning -// can slow down everyone else. As such, we partially break the dependency graph -// into multiple kernel invocations. This slightly reduces the theoretical -// parallelism, but it can make some invocations much faster due to less noise. -// -// The rowMap tells us that, within a level, thread X works on row Y. 
-// We need this because each level of the solve can have different numbers -// of non-contiguous row. -// In addition, the 'total_rows_in_prev_levels' tells us how far in that array -// to look, since previous kernel launches completed some previous rows. -__global__ void __launch_bounds__(WF_SIZE * WF_PER_WG, 1) -amd_spts_levelsync_solve( - size_t global_work_size, - const FPTYPE * __restrict__ vals, - const int * __restrict__ cols, - const int * __restrict__ rowPtrs, - const FPTYPE * __restrict__ vec_x, - FPTYPE * __restrict__ out_y, - const FPTYPE alpha, - unsigned int * __restrict__ doneArray, - const unsigned int * __restrict__ rowMap, - const unsigned int total_rows_in_prev_levels) -{ - if (global_work_size <= hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) return; - __shared__ FPTYPE *lds_ptr; - lds_ptr = nullptr; -#ifdef LDS_REDUCTION - __shared__ FPTYPE lds[WF_SIZE*WF_PER_WG]; - lds_ptr = lds; -#endif - __shared__ FPTYPE diagonal[WF_PER_WG]; - - const unsigned int gid = hipBlockIdx_x; - const unsigned int wg_lid = hipThreadIdx_x; - const unsigned int lid = wg_lid % WF_SIZE; - const unsigned int wf_id = wg_lid / WF_SIZE; - - const unsigned int row = rowMap[total_rows_in_prev_levels + (gid * WF_PER_WG) + wf_id]; - - FPTYPE temp_sum = 0.; - - if (lid == OUTPUT_THREAD) - temp_sum = alpha * vec_x[row]; - unsigned int start_of_this_row = rowPtrs[row]; - unsigned int end_of_this_row = rowPtrs[row+1]; - unsigned int start_point = start_of_this_row+lid; - - // This wavefront operates on a single row, from its beginning to end. - for(unsigned int j = start_point; j < end_of_this_row; j+=WF_SIZE) - { - // local_col will tell us, for this iteration of the above for loop - // (i.e. for this entry in this row), which columns contain the - // non-zero values. We must then ensure that the output from the row - // associated with the local_col is complete to ensure that we can - // calculate the right answer. 
- int local_col = -1; - // Haul loading from vals[] up near the load of cols[] so that we get - // good coalsced loads. - FPTYPE local_val = 0.; - unsigned int local_done = 0; - - // Replace the two loads below with inline assembly that sets the - // SLC bit. This forces the loads to essentially bypass the L2 - // to increase cache hit rate on other instructions. Vals and cols - // are basically streamed in, so caching them doesn't help much. - // local_col = cols[j]; - // local_val = vals[j]; -#ifdef USE_DOUBLE - __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dwordx2 %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j])); -#else - __asm__ volatile (MEM_PREFIX"_load_dword %0 %2 " OFF_MODIFIER " slc\n" MEM_PREFIX"_load_dword %1 %3 " OFF_MODIFIER " slc\ns_waitcnt vmcnt(0)" : "=v"(local_col), "=v"(local_val) : "v"(&cols[j]), "v"(&vals[j])); -#endif - - // diagonal. Skip this, we need to solve for it. - if (local_col == row) - { - local_done = 1; - diagonal[wf_id] = local_val; - } - - // While there are threads in this workgroup that have been unable to - // get their input, loop and wait for the flag to exist. - __asm__ volatile ("s_setprio 0"); - while (!local_done) - { - { - // Replace this atomic with an assembly load with GLC bit set. - // This forces the load to go to the coherence point, allowing - // us to avoid deadlocks. - // local_done = atomic_get_done(doneArray, local_col); - __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc slc\ns_waitcnt vmcnt(0)" : "=v"(local_done) : "v"(&doneArray[local_col])); - } - if (local_done) - { - FPTYPE out_val; - __asm__ volatile ("s_setprio 1"); - // The command below is manually replaced with GCN assembly with - // the GLC bit set. This bypasses the L1, allowing us to do a - // coherent load of the variable without needing atomics. 
-#ifdef USE_DOUBLE - // out_val = as_double(atom_or((__global ulong *)&(out_y[local_col]), 0)); - __asm__ volatile (MEM_PREFIX"_load_dwordx2 %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : "=v"(out_val) : "v"(&out_y[local_col])); -#else - // out_val = as_float(atomic_or((__global uint *)&(out_y[local_col]), 0)); - __asm__ volatile (MEM_PREFIX"_load_dword %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : "=v"(out_val) : "v"(&out_y[local_col])); -#endif - temp_sum -= local_val * out_val; - } - } - } - __asm__ volatile ("s_setprio 1"); - // Take all of the temp_sum values and add them together into - // OUTPUT_THREAD's temp_sum value. - temp_sum = cross_lane_reduction(temp_sum, lds_ptr, start_of_this_row, - end_of_this_row, wg_lid); - // y = (x-sum_of_vals_from_A) / diag - if (lid == OUTPUT_THREAD) - { -#ifndef LDS_REDUCTION - // Wait for local memory to quiesce for the diagonal - // LDS_REDUCTION has such waits in it already. - asm volatile ("s_waitcnt lgkmcnt(0)\n\t"); -#endif - FPTYPE out_val = temp_sum / diagonal[wf_id]; - //out_y[row] = out_val; -#ifdef USE_DOUBLE - __asm__ volatile (MEM_PREFIX"_store_dwordx2 %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val)); -#else - __asm__ volatile (MEM_PREFIX"_store_dword %0 %1 " OFF_MODIFIER " glc\ns_waitcnt vmcnt(0)" : : "v" (&out_y[row]), "v"(out_val)); -#endif - //out_y[row] = temp_sum / diagonal[wf_id]; // original divide - int set_one = 1; - //doneArray[row] = 1; - __asm__ volatile (MEM_PREFIX"_store_byte %0 %1 " OFF_MODIFIER " glc\n" WAKEUP : : "v"(&doneArray[row]), "v"(set_one)); - // If you add this back in after doing a native_divide up above, - // we can get *some* of the accuracy of a full Newton-Raphson - // divide while maintaining the performance of the - // native_divide() on the critical path. 
- //out_y[row] = temp_sum / diagonal[wf_id]; - } -} diff --git a/projects/rocshmem/internal/continuous_integration/compile/Jenkinsfile b/projects/rocshmem/internal/continuous_integration/compile/Jenkinsfile deleted file mode 100644 index 426a2004f8..0000000000 --- a/projects/rocshmem/internal/continuous_integration/compile/Jenkinsfile +++ /dev/null @@ -1,118 +0,0 @@ -pipeline { - agent { label 'sv-pdp-5' } - environment { - HSA_FORCE_FINE_GRAIN_PCIE = 1 - MPI_HOME="/home/resperf/mpich-4.0.1/install/global" - PATH = "$MPI_HOME/bin:$PATH" - LD_LIBRARY_PATH = "$MPI_HOME/lib:$LD_LIBRARY_PATH" - build_dir = "builds/change-${GERRIT_CHANGE_NUMBER}-${GERRIT_PATCHSET_NUMBER}" - CMAKE_PREFIX_PATH = "/opt/rocm/lib/cmake" - } - stages { - stage('Synchronize Source Code') { - steps { - checkout changelog: false, poll: false, scm: [$class: 'GitSCM', branches: [[name: 'FETCH_HEAD']], doGenerateSubmoduleConfigurations: false, extensions: [[$class: 'CloneOption', depth: 0, noTags: false, reference: '', shallow: false]], submoduleCfg: [], userRemoteConfigs: [[name: 'origin', refspec: '${GERRIT_REFSPEC}', url: 'ssh://gerritgit/rsch/ec/shmem']]] - } - } - stage('Make Build Directory') { - steps { - dir("library") { - sh "mkdir -p ${build_dir}" - } - } - } - stage('Build Source Code') { - parallel { - stage('RC_SINGLE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_SINGLE") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_single install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_SINGLE") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - } - } - } - - stage('RC_MULTI_WF_COAL') { - steps { - 
//===================== LIBRARY ======================= - dir("library/${build_dir}/RC_MULTI_WF_COAL") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_multi_wf_coal install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_MULTI_WF_COAL") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - } - } - } - - stage('RC_MULTI') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_multi install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_MULTI") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - } - } - } - - stage('DC_SINGLE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_SINGLE") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_single install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/DC_SINGLE") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - } - } - } - - stage('DC_MULTI') { - steps { - 
//===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/DC_MULTI") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - } - } - } - } - } - } -} diff --git a/projects/rocshmem/internal/continuous_integration/compile/compile_config.xml b/projects/rocshmem/internal/continuous_integration/compile/compile_config.xml deleted file mode 100644 index ccaf58b58a..0000000000 --- a/projects/rocshmem/internal/continuous_integration/compile/compile_config.xml +++ /dev/null @@ -1,93 +0,0 @@ - - - - - - - - - - - - - false - - - - - - - - PLAIN - rsch/ec/shmem - - - PLAIN - amd-master - - - false - - - - - true - true - true - true - true - - false - false - - false - true - PLAIN - - BASE64 - PLAIN - BASE64 - - - - - - - - - amd-gerrit - - - !COMPILE - - - false - - - - - - - - - 2 - - - ssh://gerritgit/rsch/ec/shmem - - - - - FETCH_HEAD - - - false - - - - internal/continuous_integration/compile/Jenkinsfile - false - - - false - \ No newline at end of file diff --git a/projects/rocshmem/internal/continuous_integration/gdb/Jenkinsfile b/projects/rocshmem/internal/continuous_integration/gdb/Jenkinsfile deleted file mode 100644 index 48106a3a5b..0000000000 --- a/projects/rocshmem/internal/continuous_integration/gdb/Jenkinsfile +++ /dev/null @@ -1,221 +0,0 @@ -pipeline { - agent { label 'sv-pdp-5' } - environment { - HSA_FORCE_FINE_GRAIN_PCIE = 1 - MPI_HOME="/home/resperf/mpich-4.0.1/install/global" - PATH = "$MPI_HOME/bin:$PATH" - LD_LIBRARY_PATH = "$MPI_HOME/lib:$LD_LIBRARY_PATH" - build_dir = 
"builds/change-${GERRIT_CHANGE_NUMBER}-${GERRIT_PATCHSET_NUMBER}" - CMAKE_PREFIX_PATH = "/opt/rocm/lib/cmake" - } - stages { - stage('Synchronize Source Code') { - steps { - checkout changelog: false, poll: false, scm: [$class: 'GitSCM', branches: [[name: 'FETCH_HEAD']], doGenerateSubmoduleConfigurations: false, extensions: [[$class: 'CloneOption', depth: 0, noTags: false, reference: '', shallow: false]], submoduleCfg: [], userRemoteConfigs: [[name: 'origin', refspec: '${GERRIT_REFSPEC}', url: 'ssh://gerritgit/rsch/ec/shmem']]] - } - } - stage('Env Variables') { - steps { - sh 'printenv' - } - } - stage('Make Build Directory') { - steps { - dir("library") { - sh "mkdir -p ${build_dir}" - } - } - } - stage('Build Source Code') { - - failFast true - - parallel { - stage('RC_SINGLE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_SINGLE") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_single install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - } - //===================== SPTS ========================== - //dir("internal/clients/spts/${build_dir}/RC_SINGLE") { - // sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - //} - } - } - - stage('RC_MULTI_WF_COAL') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_MULTI_WF_COAL") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_multi_wf_coal install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - } - //===================== SPTS ========================== - 
//dir("internal/clients/spts/${build_dir}/RC_MULTI_WF_COAL") { - // sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - //} - } - } - - stage('RC_MULTI') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_multi install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - } - //===================== SPTS ========================== - //dir("internal/clients/spts/${build_dir}/RC_MULTI") { - // sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - //} - } - } - - stage('DC_SINGLE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_SINGLE") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_single install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - } - //===================== SPTS ========================== - //dir("internal/clients/spts/${build_dir}/DC_SINGLE") { - // sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - //} - } - } - - stage('DC_MULTI') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - } - //===================== SPTS ========================== - 
//dir("internal/clients/spts/${build_dir}/DC_MULTI") { - // sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - //} - } - } - } - } - stage('Run Tests') { - stages { - stage('RC_SINGLE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE true' - } - //dir("internal/clients/spts") { - // sh './driver.sh ${build_dir}/RC_SINGLE/spts single_thread ${build_dir}/RC_SINGLE' - //} - } - } - - stage('RC_MULTI_WF_COAL') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI_WF_COAL true' - } - //dir("internal/clients/spts") { - // sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/spts multi_thread ${build_dir}/RC_MULTI_WF_COAL' - //} - } - } - - stage('RC_MULTI') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI true' - } - //dir("internal/clients/spts") { - // sh './driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RC_MULTI' - //} - } - } - - stage('DC_SINGLE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/DC_SINGLE true' - } - //dir("internal/clients/spts") { - // sh './driver.sh ${build_dir}/DC_SINGLE/spts single_thread ${build_dir}/DC_SINGLE' - //} - } - } - - stage('DC_MULTI') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI true' - } - //dir("internal/clients/spts") { - // sh './driver.sh ${build_dir}/DC_MULTI/spts multi_thread ${build_dir}/DC_MULTI' - //} - } - } - - stage('RO_NET_BASIC') { - // RO_NET controlled at runtime, no need for a new build. 
Use RC_MULTI - steps { - dir("clients/functional_tests") { - sh 'mkdir -p ${build_dir}/RO_NET_BASIC' - sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC true' - } - //dir("internal/clients/spts") { - // sh 'mkdir -p ${build_dir}/RO_NET_BASIC' - // sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC' - //} - } - } - } - } - stage('Generate Checker Metadata') { - steps { - dir("library/${build_dir}") { - sh 'git fetch --tags' - sh 'git log --pretty=oneline remotes/origin/amd-master.. > changeset_delta.txt' - sh 'git log --pretty=oneline remotes/origin/amd-master~1..remotes/origin/amd-master >> changeset_delta.txt' - } - } - } - stage('Archive Artifacts') { - steps { - dir("library/${build_dir}") { - archiveArtifacts artifacts: 'changeset_delta.txt' - } - dir("clients/functional_tests/${build_dir}") { - archiveArtifacts artifacts: 'RC_SINGLE/**/*.log' - archiveArtifacts artifacts: 'RC_MULTI/**/*.log' - archiveArtifacts artifacts: 'DC_SINGLE/**/*.log' - archiveArtifacts artifacts: 'DC_MULTI/**/*.log' - archiveArtifacts artifacts: 'RO_NET_BASIC/**/*.log' - } - } - } - } -} diff --git a/projects/rocshmem/internal/continuous_integration/long/Jenkinsfile b/projects/rocshmem/internal/continuous_integration/long/Jenkinsfile deleted file mode 100644 index 77c3420784..0000000000 --- a/projects/rocshmem/internal/continuous_integration/long/Jenkinsfile +++ /dev/null @@ -1,413 +0,0 @@ -pipeline { - agent { label 'sv-pdp-5' } - environment { - build_dir = "builds/change-${GERRIT_CHANGE_NUMBER}-${GERRIT_PATCHSET_NUMBER}" - - MPI_HOME="/home/resperf/mpich-4.0.1/install/global" - UCX_HOME="/home/resperf/ucx/install" - - PATH="$MPI_HOME/bin:$UCX_HOME/bin:$PATH" - LD_LIBRARY_PATH="$MPI_HOME/lib:$UCX_HOME/lib:$LD_LIBRARY_PATH" - PKG_CONFIG_PATH="$MPI_HOME/lib/pkgconfig:$UCX_HOME/lib/pkgconfig" - - 
CMAKE_PREFIX_PATH="/opt/rocm/lib/cmake" - - UCX_WARN_UNUSED_ENV_VARS="n" - HSA_FORCE_FINE_GRAIN_PCIE=1 - } - stages { - stage('Synchronize Source Code') { - steps { - checkout changelog: false, poll: false, scm: [$class: 'GitSCM', branches: [[name: 'FETCH_HEAD']], doGenerateSubmoduleConfigurations: false, extensions: [[$class: 'CloneOption', depth: 0, noTags: false, reference: '', shallow: false]], submoduleCfg: [], userRemoteConfigs: [[name: 'origin', refspec: '${GERRIT_REFSPEC}', url: 'ssh://gerritgit/rsch/ec/shmem']]] - } - } - stage('Env Variables') { - steps { - sh 'printenv' - } - } - stage('Make Build Directory') { - steps { - dir("library") { - sh "mkdir -p ${build_dir}" - } - } - } - stage('Build Source Code') { - - failFast true - - parallel { - stage('RC_SINGLE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_SINGLE") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_single install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - } - dir("clients/sos_tests/${build_dir}/RC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_SINGLE") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - } - } - } - - stage('RC_MULTI_WF_COAL') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_MULTI_WF_COAL") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_multi_wf_coal install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") { - sh '../../../build_configs/release 
${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - } - dir("clients/sos_tests/${build_dir}/RC_MULTI_WF_COAL") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_MULTI_WF_COAL") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - } - } - } - - stage('RC_MULTI') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_multi install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - } - dir("clients/sos_tests/${build_dir}/RC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_MULTI") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - } - } - } - - stage('RC_SINGLE_DEBUG') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_SINGLE_DEBUG") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_single_debug install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_SINGLE_DEBUG") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_DEBUG/install' - } - dir("clients/sos_tests/${build_dir}/RC_SINGLE_DEBUG") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_DEBUG/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_SINGLE_DEBUG") { - sh 
'../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE_DEBUG/install' - } - } - } - - stage('RC_SINGLE_PROFILE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_SINGLE_PROFILE") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_single_profile install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_SINGLE_PROFILE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_PROFILE/install' - } - dir("clients/sos_tests/${build_dir}/RC_SINGLE_PROFILE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_PROFILE/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_SINGLE_PROFILE") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE_PROFILE/install' - } - } - } - - stage('DC_SINGLE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_SINGLE") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_single install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - } - dir("clients/sos_tests/${build_dir}/DC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/DC_SINGLE") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - } - } - } - - stage('DC_MULTI') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi install' - } 
- //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - } - dir("clients/sos_tests/${build_dir}/DC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/DC_MULTI") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - } - } - } - - stage('DC_MULTI_IPC') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI_IPC") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi_ipc install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI_IPC") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_IPC/install' - } - dir("clients/sos_tests/${build_dir}/DC_MULTI_IPC") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_IPC/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/DC_MULTI_IPC") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_IPC/install' - } - } - } - - stage('DC_MULTI_DEBUG') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI_DEBUG") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi_debug install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI_DEBUG") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_DEBUG/install' - } - dir("clients/sos_tests/${build_dir}/DC_MULTI_DEBUG") { - sh '../../../build_configs/release 
${WORKSPACE}/library/${build_dir}/DC_MULTI_DEBUG/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/DC_MULTI_DEBUG") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_DEBUG/install' - } - } - } - - stage('DC_MULTI_PROFILE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI_PROFILE") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi_profile install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI_PROFILE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_PROFILE/install' - } - dir("clients/sos_tests/${build_dir}/DC_MULTI_PROFILE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_PROFILE/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/DC_MULTI_PROFILE") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_PROFILE/install' - } - } - } - } - } - stage('Run Tests') { - stages { - stage('RC_SINGLE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE all ${build_dir}/RC_SINGLE' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/RC_SINGLE/spts single_thread ${build_dir}/RC_SINGLE' - } - } - } - - stage('RC_MULTI_WF_COAL') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI_WF_COAL' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL all ${build_dir}/RC_MULTI_WF_COAL' - } - dir("internal/clients/spts") { - sh './driver.sh 
${build_dir}/RC_MULTI_WF_COAL/spts multi_thread ${build_dir}/RC_MULTI_WF_COAL' - } - } - } - - stage('RC_MULTI') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/RC_MULTI all ${build_dir}/RC_MULTI' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RC_MULTI' - } - } - } - - stage('RC_SINGLE_DEBUG') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE_DEBUG/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE_DEBUG' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE_DEBUG all ${build_dir}/RC_SINGLE_DEBUG' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/RC_SINGLE_DEBUG/spts single_thread ${build_dir}/RC_SINGLE_DEBUG' - } - } - } - - stage('RC_SINGLE_PROFILE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE_PROFILE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE_PROFILE' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE_PROFILE all ${build_dir}/RC_SINGLE_PROFILE' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/RC_SINGLE_PROFILE/spts single_thread ${build_dir}/RC_SINGLE_PROFILE' - } - } - } - - stage('DC_SINGLE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/DC_SINGLE' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/DC_SINGLE all ${build_dir}/DC_SINGLE' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/DC_SINGLE/spts single_thread ${build_dir}/DC_SINGLE' - } - } - } - - stage('DC_MULTI') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI' - } - 
dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/DC_MULTI all ${build_dir}/DC_MULTI' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/DC_MULTI/spts multi_thread ${build_dir}/DC_MULTI' - } - } - } - - stage('DC_MULTI_IPC') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_MULTI_IPC/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI_IPC' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/DC_MULTI_IPC all ${build_dir}/DC_MULTI_IPC' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/DC_MULTI_IPC/spts multi_thread ${build_dir}/DC_MULTI_IPC' - } - } - } - - stage('DC_MULTI_DEBUG') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_MULTI_DEBUG/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI_DEBUG' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/DC_MULTI_DEBUG all ${build_dir}/DC_MULTI_DEBUG' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/DC_MULTI_DEBUG/spts multi_thread ${build_dir}/DC_MULTI_DEBUG' - } - } - } - - stage('DC_MULTI_PROFILE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_MULTI_PROFILE/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI_PROFILE' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/DC_MULTI_PROFILE all ${build_dir}/DC_MULTI_PROFILE' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/DC_MULTI_PROFILE/spts multi_thread ${build_dir}/DC_MULTI_PROFILE' - } - } - } - - stage('RO_NET_BASIC') { - // RO_NET controlled at runtime, no need for a new build. 
Use RC_MULTI - steps { - dir("clients/functional_tests") { - sh 'mkdir -p ${build_dir}/RO_NET_BASIC' - sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC' - } - dir("clients/sos_tests") { - sh 'ROCSHMEM_RO=1 ./driver.sh ${build_dir}/RC_MULTI all ${build_dir}/RC_MULTI' - } - dir("internal/clients/spts") { - sh 'mkdir -p ${build_dir}/RO_NET_BASIC' - sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC' - } - } - } - } - } - } -} diff --git a/projects/rocshmem/internal/continuous_integration/long/long_config.xml b/projects/rocshmem/internal/continuous_integration/long/long_config.xml deleted file mode 100644 index aa49b2d399..0000000000 --- a/projects/rocshmem/internal/continuous_integration/long/long_config.xml +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - - - false - - - - - H 22 * * * - - - - - - - 2 - - - ssh://gerritgit/rsch/ec/shmem - - - - - */amd-master - - - false - - - - internal/continuous_integration/nightly/Jenkinsfile - false - - - false - \ No newline at end of file diff --git a/projects/rocshmem/internal/continuous_integration/nightly/Jenkinsfile b/projects/rocshmem/internal/continuous_integration/nightly/Jenkinsfile deleted file mode 100644 index e4e8e0284a..0000000000 --- a/projects/rocshmem/internal/continuous_integration/nightly/Jenkinsfile +++ /dev/null @@ -1,335 +0,0 @@ -pipeline { - agent { label 'sv-pdp-5' } - environment { - HSA_FORCE_FINE_GRAIN_PCIE = 1 - MPI_HOME="/home/resperf/mpich-4.0.1/install/global" - PATH = "$MPI_HOME/bin:$PATH" - LD_LIBRARY_PATH = "$MPI_HOME/lib:$LD_LIBRARY_PATH" - build_dir = "builds/${BUILD_ID}" - CMAKE_PREFIX_PATH = "/opt/rocm/lib/cmake" - } - stages { - stage('Synchronize Source Code') { - steps { - git branch: 'amd-master', changelog: false, poll: false, url: 'ssh://gerritgit/rsch/ec/shmem' - } - } - stage('Make Build Directory') { - steps { - 
dir("library") { - sh "mkdir -p ${build_dir}" - } - } - } - stage('Build Source Code') { - parallel { - stage('RC_SINGLE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_SINGLE") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_single install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_SINGLE") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - } - } - } - - stage('RC_MULTI_WF_COAL') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_MULTI_WF_COAL") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_multi_wf_coal install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_MULTI_WF_COAL") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - } - } - } - - stage('RC_MULTI') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_multi install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_MULTI") { - sh 
'../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - } - } - } - - stage('RC_SINGLE_DEBUG') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_SINGLE_DEBUG") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_single_debug install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_SINGLE_DEBUG") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_DEBUG/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_SINGLE_DEBUG") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE_DEBUG/install' - } - } - } - - stage('RC_SINGLE_PROFILE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_SINGLE_PROFILE") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_single_profile install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_SINGLE_PROFILE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE_PROFILE/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/RC_SINGLE_PROFILE") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE_PROFILE/install' - } - } - } - - stage('DC_SINGLE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_SINGLE") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_single install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - } - //===================== SPTS ========================== - 
dir("internal/clients/spts/${build_dir}/DC_SINGLE") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - } - } - } - - stage('DC_MULTI') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/DC_MULTI") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - } - } - } - - stage('DC_MULTI_IPC') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI_IPC") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi_ipc install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI_IPC") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_IPC/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/DC_MULTI_IPC") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_IPC/install' - } - } - } - - stage('DC_MULTI_DEBUG') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI_DEBUG") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi_debug install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI_DEBUG") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_DEBUG/install' - } - //===================== SPTS ========================== 
- dir("internal/clients/spts/${build_dir}/DC_MULTI_DEBUG") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_DEBUG/install' - } - } - } - - stage('DC_MULTI_PROFILE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI_PROFILE") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi_profile install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI_PROFILE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI_PROFILE/install' - } - //===================== SPTS ========================== - dir("internal/clients/spts/${build_dir}/DC_MULTI_PROFILE") { - sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI_PROFILE/install' - } - } - } - } - } - stage('Run Tests') { - stages { - stage('RC_SINGLE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/RC_SINGLE/spts single_thread ${build_dir}/RC_SINGLE' - } - } - } - - stage('RC_MULTI_WF_COAL') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI_WF_COAL' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/spts multi_thread ${build_dir}/RC_MULTI_WF_COAL' - } - } - } - - stage('RC_MULTI') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RC_MULTI' - } - } - } - - stage('RC_SINGLE_DEBUG') { - steps { - dir("clients/functional_tests") { - sh './driver.sh 
${build_dir}/RC_SINGLE_DEBUG/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE_DEBUG' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/RC_SINGLE_DEBUG/spts single_thread ${build_dir}/RC_SINGLE_DEBUG' - } - } - } - - stage('RC_SINGLE_PROFILE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE_PROFILE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE_PROFILE' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/RC_SINGLE_PROFILE/spts single_thread ${build_dir}/RC_SINGLE_PROFILE' - } - } - } - - stage('DC_SINGLE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/DC_SINGLE' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/DC_SINGLE/spts single_thread ${build_dir}/DC_SINGLE' - } - } - } - - stage('DC_MULTI') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/DC_MULTI/spts multi_thread ${build_dir}/DC_MULTI' - } - } - } - - stage('DC_MULTI_IPC') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_MULTI_IPC/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI_IPC' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/DC_MULTI_IPC/spts multi_thread ${build_dir}/DC_MULTI_IPC' - } - } - } - - stage('DC_MULTI_DEBUG') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_MULTI_DEBUG/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI_DEBUG' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/DC_MULTI_DEBUG/spts multi_thread ${build_dir}/DC_MULTI_DEBUG' - } - } - } - - stage('DC_MULTI_PROFILE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_MULTI_PROFILE/rocshmem_example_driver 
multi_thread ${build_dir}/DC_MULTI_PROFILE' - } - dir("internal/clients/spts") { - sh './driver.sh ${build_dir}/DC_MULTI_PROFILE/spts multi_thread ${build_dir}/DC_MULTI_PROFILE' - } - } - } - - stage('RO_NET_BASIC') { - // RO_NET controlled at runtime, no need for a new build. Use RC_MULTI - steps { - dir("clients/functional_tests") { - sh 'mkdir -p ${build_dir}/RO_NET_BASIC' - sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC' - } - dir("internal/clients/spts") { - sh 'mkdir -p ${build_dir}/RO_NET_BASIC' - sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RO_NET_BASIC' - } - } - } - } - } - } -} diff --git a/projects/rocshmem/internal/continuous_integration/nightly/nightly_config.xml b/projects/rocshmem/internal/continuous_integration/nightly/nightly_config.xml deleted file mode 100644 index aa49b2d399..0000000000 --- a/projects/rocshmem/internal/continuous_integration/nightly/nightly_config.xml +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - - - false - - - - - H 22 * * * - - - - - - - 2 - - - ssh://gerritgit/rsch/ec/shmem - - - - - */amd-master - - - false - - - - internal/continuous_integration/nightly/Jenkinsfile - false - - - false - \ No newline at end of file diff --git a/projects/rocshmem/internal/continuous_integration/short/Jenkinsfile b/projects/rocshmem/internal/continuous_integration/short/Jenkinsfile deleted file mode 100644 index 56b22d05d5..0000000000 --- a/projects/rocshmem/internal/continuous_integration/short/Jenkinsfile +++ /dev/null @@ -1,288 +0,0 @@ -pipeline { - agent { label 'sv-pdp-7' } - environment { - build_dir = "builds/change-${GERRIT_CHANGE_NUMBER}-${GERRIT_PATCHSET_NUMBER}" - - MPI_HOME="/home/resperf/mpich/install" - UCX_HOME="/home/resperf/ucx/install" - - PATH="$MPI_HOME/bin:$UCX_HOME/bin:$PATH" - LD_LIBRARY_PATH="$MPI_HOME/lib:$UCX_HOME/lib:$LD_LIBRARY_PATH" - 
PKG_CONFIG_PATH="$MPI_HOME/lib/pkgconfig:$UCX_HOME/lib/pkgconfig" - - CMAKE_PREFIX_PATH="/opt/rocm/lib/cmake" - - UCX_WARN_UNUSED_ENV_VARS="n" - HSA_FORCE_FINE_GRAIN_PCIE=1 - UCX_TLS="rc" - ROCSHMEM_USE_SQ_GPU_MEM=0 - ROCSHMEM_USE_CQ_GPU_MEM=0 - ROCSHMEM_NUM_BLOCKS=128 - } - stages { - stage('Synchronize Source Code') { - steps { - checkout changelog: false, poll: false, scm: [$class: 'GitSCM', branches: [[name: 'FETCH_HEAD']], doGenerateSubmoduleConfigurations: false, extensions: [[$class: 'CloneOption', depth: 0, noTags: false, reference: '', shallow: false]], submoduleCfg: [], userRemoteConfigs: [[name: 'origin', refspec: '${GERRIT_REFSPEC}', url: 'ssh://gerritgit/rsch/ec/shmem']]] - } - } - stage('Env Variables') { - steps { - sh 'printenv' - } - } - stage('Make Build Directory') { - steps { - dir("library") { - sh "mkdir -p ${build_dir}" - } - } - } - stage('Build Source Code') { - - failFast true - - parallel { - stage('RC_SINGLE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_SINGLE") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_single install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - } - dir("clients/sos_tests/${build_dir}/RC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - } - //===================== SPTS ========================== - //dir("internal/clients/spts/${build_dir}/RC_SINGLE") { - // sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_SINGLE/install' - //} - } - } - - stage('RC_MULTI_WF_COAL') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_MULTI_WF_COAL") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_multi_wf_coal install' - } - //===================== 
CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - } - dir("clients/sos_tests/${build_dir}/RC_MULTI_WF_COAL") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - } - //===================== SPTS ========================== - //dir("internal/clients/spts/${build_dir}/RC_MULTI_WF_COAL") { - // sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - //} - } - } - - stage('RC_MULTI') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_multi install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - } - dir("clients/sos_tests/${build_dir}/RC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - } - //===================== SPTS ========================== - //dir("internal/clients/spts/${build_dir}/RC_MULTI") { - // sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - //} - } - } - - stage('DC_SINGLE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_SINGLE") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_single install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - } - dir("clients/sos_tests/${build_dir}/DC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - } - 
//===================== SPTS ========================== - //dir("internal/clients/spts/${build_dir}/DC_SINGLE") { - // sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - //} - } - } - - stage('DC_MULTI') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - } - dir("clients/sos_tests/${build_dir}/DC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - } - //===================== SPTS ========================== - //dir("internal/clients/spts/${build_dir}/DC_MULTI") { - // sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - //} - } - } - - stage('RO_NET') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RO_NET") { - sh 'mkdir -p install' - sh '../../../build_configs/ro_net install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RO_NET") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RO_NET/install' - } - dir("clients/sos_tests/${build_dir}/RO_NET") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RO_NET/install' - } - //===================== SPTS ========================== - //dir("internal/clients/spts/${build_dir}/RO_NET") { - // sh '../../../build_configs/analyze_single_rocshmem ${WORKSPACE}/library/${build_dir}/RO_NET/install' - //} - } - } - } - } - stage('Run Tests') { - stages { - stage('RC_SINGLE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE/rocshmem_example_driver 
single_thread ${build_dir}/RC_SINGLE' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE short ${build_dir}/RC_SINGLE' - } - //dir("internal/clients/spts") { - // sh './driver.sh ${build_dir}/RC_SINGLE/spts single_thread ${build_dir}/RC_SINGLE' - //} - } - } - - stage('RC_MULTI_WF_COAL') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI_WF_COAL' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL short ${build_dir}/RC_MULTI_WF_COAL' - } - //dir("internal/clients/spts") { - // sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/spts multi_thread ${build_dir}/RC_MULTI_WF_COAL' - //} - } - } - - stage('RC_MULTI') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/RC_MULTI short ${build_dir}/RC_MULTI' - } - //dir("internal/clients/spts") { - // sh './driver.sh ${build_dir}/RC_MULTI/spts multi_thread ${build_dir}/RC_MULTI' - //} - } - } - - stage('DC_SINGLE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/DC_SINGLE' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/DC_SINGLE short ${build_dir}/DC_SINGLE' - } - //dir("internal/clients/spts") { - // sh './driver.sh ${build_dir}/DC_SINGLE/spts single_thread ${build_dir}/DC_SINGLE' - //} - } - } - - stage('DC_MULTI') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI' - } - dir("clients/sos_tests") { - sh './driver.sh ${build_dir}/DC_MULTI short ${build_dir}/DC_MULTI' - } - //dir("internal/clients/spts") { - // sh './driver.sh ${build_dir}/DC_MULTI/spts multi_thread ${build_dir}/DC_MULTI' - //} - } - } - - 
stage('RO_NET') { - steps { - dir("clients/functional_tests") { - sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/rocshmem_example_driver ro ${build_dir}/RO_NET' - } - dir("clients/sos_tests") { - sh 'ROCSHMEM_RO=1 ./driver.sh ${build_dir}/RO_NET short ${build_dir}/RO_NET' - } - //dir("internal/clients/spts") { - // sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RO_NET/spts multi_thread ${build_dir}/RO_NET' - //} - } - } - } - } - stage('Generate Checker Metadata') { - steps { - dir("library/${build_dir}") { - sh 'git fetch --tags' - sh 'git log --pretty=oneline remotes/origin/amd-master.. > changeset_delta.txt' - sh 'git log --pretty=oneline remotes/origin/amd-master~1..remotes/origin/amd-master >> changeset_delta.txt' - } - } - } - stage('Archive Artifacts') { - steps { - dir("library/${build_dir}") { - archiveArtifacts artifacts: 'changeset_delta.txt' - } - dir("clients/functional_tests/${build_dir}") { - archiveArtifacts artifacts: 'RC_SINGLE/**/*.log' - archiveArtifacts artifacts: 'RC_MULTI/**/*.log' - archiveArtifacts artifacts: 'DC_SINGLE/**/*.log' - archiveArtifacts artifacts: 'DC_MULTI/**/*.log' - archiveArtifacts artifacts: 'RO_NET/**/*.log' - } - } - } - } - post { - success { - build job: 'shmem_perf_check', wait: true - } - } -} diff --git a/projects/rocshmem/internal/continuous_integration/short/__init__.py b/projects/rocshmem/internal/continuous_integration/short/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/projects/rocshmem/internal/continuous_integration/short/absolute_path.py b/projects/rocshmem/internal/continuous_integration/short/absolute_path.py deleted file mode 100644 index 7d11a7ce44..0000000000 --- a/projects/rocshmem/internal/continuous_integration/short/absolute_path.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/tool/pandora64/.package/python-3.8.0/bin/python3 - -import glob -import pprint - -class PathGlobber(): - def __init__(self, name, 
*partial_paths_to_concatenate): - self._search_path = '' - for partial_path in partial_paths_to_concatenate: - self._search_path += partial_path - self.dirs = [] - self._name = name - - def generate(self): - self.dirs = glob.glob(self._search_path, recursive=True) - - def dump(self): - str_out = self._name - str_out += pprint.pformat(self.dirs, width=120) - str_out += '\n' - return str_out diff --git a/projects/rocshmem/internal/continuous_integration/short/archive_path.py b/projects/rocshmem/internal/continuous_integration/short/archive_path.py deleted file mode 100644 index 1df07aa631..0000000000 --- a/projects/rocshmem/internal/continuous_integration/short/archive_path.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/tool/pandora64/.package/python-3.8.0/bin/python3 - -import absolute_path -import glob - -class Archive(absolute_path.PathGlobber): - def __init__(self, args, name=''): - archive_path = args.archive_path - super().__init__(name, args.jenkins_path, archive_path, - args.benchmark_path) - - def path_of_build(self, build_id): - path = self._search_path.replace('*/archive', build_id + '/archive') - path = glob.glob(path) - return path[0] diff --git a/projects/rocshmem/internal/continuous_integration/short/check_perf_delta.py b/projects/rocshmem/internal/continuous_integration/short/check_perf_delta.py deleted file mode 100755 index be09e8cab4..0000000000 --- a/projects/rocshmem/internal/continuous_integration/short/check_perf_delta.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/tool/pandora64/.package/python-3.8.0/bin/python3 - -import parser -import dictionary -import archive_path -import checker - -def main(): - # This script accepts command line values, but has reasonable defaults - # needed to run as part of the CI infrastructure. - p = parser.Parser() - args = p.parse_command_line() - - # Jenkins is configured to archive build artifacts in a directory. 
- # The 'archives' variable holds the set of directories for - # successful Jenkins builds (those which run to completion). - # Partitioning of successful builds is useful since we can ignore - # failed build directories while searching for performance data. - archives = archive_path.Archive(args) - archives.generate() - print(archives.dump()) - - # Jenkins records changeset information in a changeset_delta.txt file. - # We parse the changelog for the commit hash and save it into - # 'builds_to_changesets'. - build_to_changeset = dictionary.BuildToChangesetDict() - build_to_changeset.generate(archives.dirs) - print(build_to_changeset.dump()) - - # 'changeset_to_build' holds the changeset mappings with a - # list of build numbers that match the changeset value. - # Builds may be executed many times with the same changeset. - # The most recent build (identified by the largest build number) will - # be used to retrieve performance data. - changeset_to_build = dictionary.ChangesetToBuildDict() - changeset_to_build.generate(build_to_changeset) - print(changeset_to_build.dump()) - - # Jenkins is configured to dump Gerrit-esque relation chain changesets - # to an archived output file 'changeset-delta.txt'. - # The relation chain will be used to determine changeset performance - # data for each changeset in the relation chain (when possible). 
- build_to_relation_chain = dictionary.BuildToRelationChainDict() - build_to_relation_chain.generate(archives.dirs) - print(build_to_relation_chain.dump()) - - perf_checker = checker.Performance(args, - archives, - changeset_to_build, - build_to_relation_chain) - perf_checker.run() - -if __name__ == '__main__': - main() diff --git a/projects/rocshmem/internal/continuous_integration/short/checker.py b/projects/rocshmem/internal/continuous_integration/short/checker.py deleted file mode 100644 index 3305115a56..0000000000 --- a/projects/rocshmem/internal/continuous_integration/short/checker.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/tool/pandora64/.package/python-3.8.0/bin/python3 - -import archive_path -import log -import dictionary -import report -import violation - -class Performance(): - def __init__(self, args, archives, changeset_to_build, - build_to_relation_chain): - self._args = args - self._archives = archives - self._changeset_to_build = changeset_to_build - self._build_to_relation_chain = build_to_relation_chain - self._build_id = build_to_relation_chain.most_recent_build() - self._archive_path = archives.path_of_build(self._build_id) - self._output = report.Report(self._build_id, - self._archive_path, - 'performance_diff.txt') - - def _other_build_id(self, other_changeset): - packed_id = [build_id for chng, - build_id in self._changeset_to_build.data.items() - if chng.startswith(other_changeset)] - - # The 'packed_id' variable is a list containing lists. - # We need the content inside the packed_id data structure. - try: - build_id = packed_id[0][0] - return True, build_id - except IndexError: - # An index error can occur if builds in the relation chain - # have not been tested before attempting to test this - # changeset. 
- return False, 0 - - def _log_difference(self, log_filename, other_changeset, - other_archive_path, violations): - print('determining difference of log file ' + log_filename) - self._output.record(log_filename) - - current_file_path = self._archive_path + '/' + log_filename - other_file_path = other_archive_path + '/' + log_filename - log_pair = log.Pair(current_file_path, other_file_path) - log_pair.calculate_differences() - - latency_perc = [float(i.strip('%')) \ - for i in log_pair.latency_percentage_differences] - max_latency = max(latency_perc) - violations.check(max_latency, other_changeset, log_filename) - - self._output.record(log_pair.dump()) - - def _changeset_difference(self, current_changeset, other_changeset): - violations = violation.Threshold(self._args.latency_max, 'latency') - - change_pair = '(' + current_changeset + ',' + other_changeset + ')' - print('comparing changesets ' + change_pair) - self._output.record(change_pair) - - status, other_build_id = self._other_build_id(other_changeset) - if status == False: - message = 'skipping changeset ' + other_changeset - print(message) - self._output.record(message) - return violations - - other_archive_path = self._archives.path_of_build(other_build_id) - print(self._archive_path) - print(other_archive_path) - - for filename in self._args.logs: - self._log_difference(filename, other_changeset, - other_archive_path, violations) - print('\n') - - return violations - - def _calculate_performance_differences(self): - current_changeset = \ - self._build_to_relation_chain.data[self._build_id][0] - other_changesets = \ - self._build_to_relation_chain.data[self._build_id][1:] - - for other_changeset in other_changesets: - violations = self._changeset_difference(current_changeset, - other_changeset) - - # Only report on the last pairwise changeset combination. - # This combination represents the changeset being tested and - # the amd-master:HEAD. 
class BaseDict(metaclass=abc.ABCMeta):
    """Base class for dictionaries keyed by build id and filled from archives.

    For every archive directory handed to generate(), the build id is
    extracted from the path, the build's changeset-delta file is opened, and
    the subclass hook _changeset_delta_operations() decides what gets stored
    in self.data.
    """

    def __init__(self):
        self.data = {}
        # Label prepended by dump(); subclasses overwrite it with their own
        # name. Defined here so dump() never depends on a subclass setting it.
        self._print_text = ''
        # Path component that immediately follows the build id.
        self._delimiter_path = 'archive'
        self._changeset_delta_filename = 'changeset_delta.txt'

    def _build_id(self, build_directory):
        """Return the path component that precedes the delimiter component.

        Empty components (leading, trailing or doubled slashes) are ignored,
        so absolute and relative paths give the same answer. If the delimiter
        is absent, the last non-empty component is returned; an empty string
        is returned when no component is available.
        """
        components = [part for part in build_directory.split('/') if part != '']
        if self._delimiter_path in components:
            components = components[:components.index(self._delimiter_path)]
        return components[-1] if components else ''

    def _open_changeset_delta_file(self, archive_directory):
        """Open the changeset-delta file located in the parent of the path.

        The final path component of archive_directory is dropped and the
        delta filename is appended to what remains. Exits the process with a
        message when the file cannot be opened.
        """
        build_directory, _config_directory = os.path.split(archive_directory)
        changeset_file_path = build_directory + '/' + \
            self._changeset_delta_filename
        try:
            return open(changeset_file_path, 'r')
        except OSError:
            sys.exit('failed to open: ' + changeset_file_path)

    @abc.abstractmethod
    def _changeset_delta_operations(self, file_handle, bld_id):
        """Consume file_handle and record an entry for bld_id in self.data."""
        pass

    def generate(self, archives):
        """Populate self.data from an iterable of archive directory paths."""
        for directory in archives:
            bld_id = self._build_id(directory)
            # The context manager guarantees the handle is closed even when
            # the subclass hook raises.
            with self._open_changeset_delta_file(directory) as file_handle:
                self._changeset_delta_operations(file_handle, bld_id)

    def most_recent_build(self):
        """Return the numerically largest build id as a string.

        Raises ValueError when self.data is empty or a key is not an integer
        string.
        """
        return str(max(int(build_id) for build_id in self.data.keys()))

    def dump(self):
        """Return the label followed by a pretty-printed copy of self.data."""
        return self._print_text + pprint.pformat(self.data, width=120) + '\n'


class BuildToChangesetDict(BaseDict):
    """Maps build id -> first whitespace-separated token of the delta file."""

    def __init__(self, name=''):
        super().__init__()
        self._print_text = name

    def _changeset_delta_operations(self, file_handle, bld_id):
        """Store the first token of the first line; skip builds without one."""
        fields = file_handle.readline().split()
        if fields:
            self.data[bld_id] = fields[0]


class BuildToRelationChainDict(BaseDict):
    """Maps build id -> list of the first token of every delta-file line."""

    def __init__(self, name=''):
        super().__init__()
        self._print_text = name

    def _changeset_delta_operations(self, file_handle, bld_id):
        """Collect the leading token of each non-blank line of the file."""
        changes = []
        for line in file_handle:
            fields = line.split()
            # Blank lines have no token; skipping them avoids an IndexError.
            if fields:
                changes.append(fields[0])
        self.data[bld_id] = changes


class ChangesetToBuildDict():
    """Inverse mapping: value of another dictionary -> list of its keys."""

    def __init__(self, name=''):
        self.data = {}
        self._print_text = name

    def _invert_dict(self, dictionary):
        """Group the keys of dictionary.data by their (possibly shared) value."""
        inverted = {}
        for key, value in dictionary.data.items():
            inverted.setdefault(value, []).append(key)
        return inverted

    def generate(self, dictionary):
        """Replace self.data with the inversion of dictionary.data."""
        self.data = self._invert_dict(dictionary)

    def dump(self):
        """Return the label followed by a pretty-printed copy of self.data."""
        return self._print_text + pprint.pformat(self.data, width=120) + '\n'
class Log():
    """Latency/bandwidth samples read from one benchmark log file.

    Usage is two-step: open() acquires the file, parse() reads it and fills
    self.latency and self.bandwidth.
    """

    def __init__(self, logfile_abspath):
        self._file_path = logfile_abspath
        self.latency = []
        self.bandwidth = []
        # Matches lines that contain a decimal number followed, later on the
        # same line, by another digit.digit sequence. Raw string so that the
        # backslashes reach the regex engine without escape warnings.
        self._regex = r'.*[0-9]+\.[0-9]+.*[0-9]\.[0-9].*'

    def open(self):
        """Open the log for reading; exit the process if that fails."""
        try:
            self._file_handle = open(self._file_path, 'r')
        except OSError:
            sys.exit('failed to open: ' + self._file_path)

    def parse(self):
        """Append the first two columns of every matching line, then close.

        Column one goes to self.latency and column two to self.bandwidth,
        both rounded to four decimals. Calling parse() again after the file
        has been consumed is a no-op.
        """
        if self._file_handle.closed:
            return
        # Closing here releases the descriptor as soon as the data is read.
        with self._file_handle as handle:
            for line in handle:
                if re.match(self._regex, line):
                    entries = line.split()
                    self.latency.append(round(float(entries[0]), 4))
                    self.bandwidth.append(round(float(entries[1]), 4))


class Pair():
    """Two parsed logs plus their element-wise percentage differences."""

    def __init__(self, first_logfile_abspath, second_logfile_abspath):
        self.first = Log(first_logfile_abspath)
        self.first.open()
        self.first.parse()
        self.second = Log(second_logfile_abspath)
        self.second.open()
        self.second.parse()

    def _ratio(self, a, b):
        """Return (a[i] - b[i]) / a[i] per pair, rounded to four decimals.

        A zero denominator yields 0.0 for that position. The result is as
        long as the shorter of the two inputs.
        """
        ratio = []
        for first_value, second_value in zip(a, b):
            difference = round(first_value - second_value, 4)
            try:
                ratio.append(round(difference / first_value, 4))
            except ZeroDivisionError:
                ratio.append(0.0)
        return ratio

    def _percent(self, ratio):
        """Format each ratio as a percentage string with two decimals."""
        return ['{0:.2%}'.format(x) for x in ratio]

    def _percentage_difference(self, a, b):
        """Relative difference of b with respect to a, as percent strings."""
        return self._percent(self._ratio(a, b))

    def calculate_differences(self):
        """Compute and store the latency and bandwidth percentage lists."""
        self.latency_percentage_differences = \
            self._percentage_difference(self.first.latency,
                                        self.second.latency)
        self.bandwidth_percentage_differences = \
            self._percentage_difference(self.first.bandwidth,
                                        self.second.bandwidth)

    def dump(self):
        """Return a tab-indented text report of both logs and the deltas.

        calculate_differences() must have been called first.
        """
        delim = ', '
        rows = [
            ('\tlatency:', (self.first.latency,
                            self.second.latency,
                            self.latency_percentage_differences)),
            ('\n\tbandwidth:', (self.first.bandwidth,
                                self.second.bandwidth,
                                self.bandwidth_percentage_differences)),
        ]
        output = ''
        for heading, series in rows:
            output += heading
            for values in series:
                output += '\n\t\t'
                output += delim.join(map(str, values))
        return output
class Parser():
    """Command-line definition for the performance checking/plotting tools."""

    def __init__(self):
        # A parent directory containing log file output from one of the
        # configuration runs. The output directories are intended to
        # be symmetric in naming with the various configurations supplied
        # by the library's build_configs.
        self._default_config = 'RC_SINGLE'

        # The list of log files which need to be checked for performance
        # differences.
        self._default_logs = ['get.log',
                              'get_nbi.log',
                              'get_swarm.log',
                              'put.log',
                              'put_nbi.log']

        # The maximum pairwise difference for the log file latencies.
        self._default_latency_max = 5.0

        # The minimum bandwidth difference for the log file bandwidths.
        self._default_bandwidth_min = -50.0

        # Top-level path under which the Jenkins tester archives build
        # output (as the resperf account).
        self._default_jenkins_path = \
            '/proj/radl_extra/users/resperf/jenkins-2.192/'

        # The performance tester runs as part of the 'short' job to
        # verify that no performance degradation has occurred between
        # commits. This is the generic archive path for all of the builds;
        # the '*' is a placeholder for the Jenkins build number.
        self._default_archive_path = \
            'jobs/shmem_short/builds/*/archive/'

        # The default benchmark path can be used to alter archive
        # output placement. It starts out empty and has the config name
        # prepended in parse_command_line().
        self._default_benchmark_path = ''

    def setup_options(self, argparser):
        """Register every supported option on argparser and return it."""
        # (flag, keyword arguments) pairs, in the order they are registered.
        options = [
            ('-j', dict(dest='jenkins_path',
                        default=self._default_jenkins_path)),
            ('-a', dict(dest='archive_path',
                        default=self._default_archive_path)),
            ('-b', dict(dest='benchmark_path',
                        default=self._default_benchmark_path)),
            ('-c', dict(dest='config',
                        default=self._default_config)),
            ('-l', dict(dest='logs',
                        nargs='*',
                        default=self._default_logs)),
            ('-x', dict(dest='latency_max',
                        type=float,
                        default=self._default_latency_max)),
            ('-y', dict(dest='bandwidth_min',
                        type=float,
                        default=self._default_bandwidth_min)),
            ('-o', dict(dest='one_changeset')),
            ('-r', dict(dest='changeset_range',
                        nargs=2,
                        metavar=("most_recent_changeset",
                                 "least_recent_changeset"))),
        ]
        for flag, keywords in options:
            argparser.add_argument(flag, **keywords)
        return argparser

    def parse_command_line(self, argv=None):
        """Parse the arguments and return the resulting namespace.

        argv is an optional list of argument strings; when omitted, argparse
        reads the process command line, which is the original behavior.
        The returned benchmark_path is the config name concatenated with
        whatever was passed via -b.
        """
        p = argparse.ArgumentParser()
        p = self.setup_options(p)
        args = p.parse_args(argv)
        args.benchmark_path = args.config + args.benchmark_path
        return args
def main():
    """Build the build/changeset lookup tables and drive the plotter.

    Each intermediate table is printed as it is produced. The changeset
    slice is always plotted; the single-changeset plots are produced in
    addition when the -o option was given.
    """
    args = parser.Parser().parse_command_line()

    archive_set = archive_path.Archive(args)
    archive_set.generate()
    print(archive_set.dump())

    builds_by_id = dictionary.BuildToChangesetDict()
    builds_by_id.generate(archive_set.dirs)
    print(builds_by_id.dump())

    builds_by_changeset = dictionary.ChangesetToBuildDict()
    builds_by_changeset.generate(builds_by_id)
    print(builds_by_changeset.dump())

    figure = plotter.Plot(args, archive_set, builds_by_changeset)

    # either plot with all the changesets or the slice provided
    figure.changeset_slice()

    if args.one_changeset:
        figure.one_changeset_plot()
# Reverse every column of `df` and pin the order of the Commit factor to the
# order in which commits then appear, so the x-axis follows row order.
set_right_order <- function(df) {
  # reverse the order of the rows so that oldest commit is first
  reversed <- map_df(df, rev)
  # ensure that ggplot plots the x-axis in the right order
  commit_levels <- unique(reversed$Commit)
  reversed$Commit <- factor(reversed$Commit, levels = commit_levels)
  reversed
}

# Draw column `yval` against column `xval` of `df` as a line with points and
# save the figure to `filename` as a PDF. Colors come from the script-level
# `mycolors` palette.
plot_and_save <- function(df, xval, yval, title, subtitle, xlabel, filename) {
  fig <- ggplot(df, aes_string(x=xval, y=yval, group=1))
  fig <- fig + geom_line(size = 0.5, color=mycolors[1])
  fig <- fig + geom_point(size = 1.5, alpha = 1, color=mycolors[2])
  fig <- fig + theme_minimal()
  fig <- fig + expand_limits(y=0)
  fig <- fig + xlab(xlabel)
  fig <- fig + ggtitle(title, subtitle = subtitle)
  fig <- fig + theme(
    axis.text.x = element_text(angle=90,hjust=1),
    axis.title.y = element_blank()
  )
  fig <- fig + scale_fill_manual(values = mycolors)
  ggsave(filename, fig, device=pdf, dpi=500)
}
(!is.null(opts$one_changeset)) { - single_opt = 1 -} - -if ( (slice_opt && single_opt) || (!slice_opt && !single_opt) ) { - stop("Please supply a slice or a single changeset, not both.", call.= FALSE) -} - -# choose color palette -mycolors <- brewer.pal(5, "Set2") - -if (length(opts$one_changeset) > 0) { - ## Plotting data for a single changeset ## - - # read the files - non_amo <- read.csv("non_amo_one_changeset.csv", header=TRUE) - amo <- read.csv("amo_one_changeset.csv", header=TRUE) - ping_pong <- read.csv("ping_pong_one_changeset.csv", header=TRUE) - - # ensure that ggplot plots the x-axis in the right order - non_amo$size <- factor(non_amo$size, levels = unique(non_amo$size)) - amo$op <- factor(amo$op, levels = unique(amo$op)) - - # plot - non_amo_ops <- list("put","put_nbi","get","get_nbi") - for (op in non_amo_ops) { - plot_and_save(df=non_amo, - xval="size", - yval=op, - title=op, - subtitle="Latency (us)", - xlabel="Message size (bytes)", - filename=paste(opts$output,"/",op,"_changeset_",opts$one_changeset,".pdf", sep="") - ) - } - - # prepare data for plots with fixed message size and ops as x axis - non_amo$bsize <- paste("b",non_amo$size,sep="") # (so that the columns in non_amo_t start with a character) - non_amo_t <- setNames(data.frame(t(non_amo[,2:5])), non_amo[,6]) # transpose + set column names - non_amo_t$op <- colnames(non_amo[,2:5]) # make a column with operation names - - sizes <- colnames(non_amo_t[,-(length(colnames(non_amo_t)))]) - for (size in sizes) { - plot_and_save(df=non_amo_t, - xval="op", - yval=size, - title=paste(sub('.', '', size),"byte"), - subtitle="Latency (us)", - xlabel="Operation", - filename=paste(opts$output,"/",size,"_changeset_",opts$one_changeset,".pdf", sep="") - ) - } - - plot_and_save(df=amo, - xval="op", - yval="latency", - title="Atomics", - subtitle="Latency (us)", - xlabel="Operation", - filename=paste(opts$output,"/atomic_changeset_",opts$one_changeset,".pdf", sep="") - ) - - ping_pong$type <- c("ping_pong") - 
p<-ggplot(ping_pong, aes(x=type, y=latency, fill=type)) + - geom_bar(stat="identity", width=0.5) + - theme_minimal() + - ggtitle("Ping pong", subtitle = "Latency (us)") + - theme( - axis.title.y = element_blank(), - axis.text.y = element_blank(), - axis.title.x = element_blank(), - legend.position = "none" - ) + - coord_flip() + - scale_fill_manual(values = mycolors) - #ggpreview(width=7.5, height=5, units="in", dpi=500) - ggsave(paste(opts$output,"/ping_pong_changeset_",opts$one_changeset,".pdf", sep=""), p, device=pdf, dpi=500) - -} else { - ## Plotting across a changeset slice ## - - # read the files - put <- read.csv("put.csv", header=TRUE) - put_nbi <- read.csv("put_nbi.csv", header=TRUE) - get <- read.csv("get.csv", header=TRUE) - get_nbi <- read.csv("get_nbi.csv", header=TRUE) - amo <- read.csv("amo.csv", header=TRUE) - ping_pong <- read.csv("ping_pong.csv", header=TRUE) - - # slice out the commits - start <- match(c(opts$changeset_a), put$Commit) - end <- match(c(opts$changeset_b), put$Commit) - # (start and end should be the same for all the frames) # - put <- put[start:end,] - put_nbi <- put_nbi[start:end,] - get <- get[start:end,] - get_nbi <- get_nbi[start:end,] - amo <- amo[start:end,] - ping_pong <- ping_pong[start:end,] - - put <- set_right_order(put) - put_nbi <- set_right_order(put_nbi) - get <- set_right_order(get) - get_nbi <- set_right_order(get_nbi) - amo <- set_right_order(amo) - ping_pong <- set_right_order(ping_pong) - - # plot - non_amo_ops <- list("put","put_nbi","get","get_nbi") - sizes_to_subtitle_map <- list("b1"="1 byte", - "b2"="2 bytes", - "b4"="4 bytes", - "b8"="8 bytes", - "b16"="16 bytes", - "b32"="32 bytes", - "b64"="64 bytes", - "b128"="128 bytes", - "b256"="256 bytes", - "b512"="512 bytes", - "b1024"="1024 bytes", - "b2048"="2048 bytes", - "b4096"="4096 bytes", - "b8192"="8192 bytes", - "b16384"="16384 bytes", - "b32768"="32768 bytes") - for (op in non_amo_ops) { - for (size in names(sizes_to_subtitle_map)) { - 
class Plot():
    """Turns tracked benchmark logs into CSV files and runs the R plotter.

    changeset_slice() must be called before one_changeset_plot(): it creates
    the log tracker that both methods read.
    """

    # Message sizes in bytes; position i corresponds to latency[i] of a
    # put/get style log.
    _MESSAGE_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
                      1024, 2048, 4096, 8192, 16384, 32768]

    # Log file name -> operation name for the per-message-size logs.
    _SIZE_LOGS = {'put.log': 'put',
                  'put_nbi.log': 'put_nbi',
                  'get.log': 'get',
                  'get_nbi.log': 'get_nbi'}

    # Log file name -> operation name for the atomic-operation logs.
    _AMO_LOGS = {'amo_add.log': 'add',
                 'amo_cswap.log': 'cswap',
                 'amo_fadd.log': 'fadd',
                 'amo_fcswap.log': 'fcswap',
                 'amo_fetch.log': 'fetch',
                 'amo_finc.log': 'finc',
                 'amo_inc.log': 'inc'}

    def __init__(self, args, archives, changeset_to_build):
        self._args = args
        self._archives = archives
        self._changelog = dictionary.ChangelogToMostRecentBuild()
        self._changelog.generate(changeset_to_build)
        print(self._changelog.dump())

    def abbreviate_changesets(self, changesets):
        """Return the first eight characters of every changeset id."""
        return [changeset[0:8] for changeset in changesets]

    @staticmethod
    def write_dict_to_file(tracker, field_names, file_name):
        """Write a list of row dictionaries to file_name as CSV with header."""
        # newline='' is what the csv module asks for; it leaves the writer's
        # own line terminator untouched on every platform.
        with open(file_name, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=field_names)
            writer.writeheader()
            writer.writerows(tracker)

    @staticmethod
    def check_and_add_to_dict(dictionary, key, array):
        """Store array[0] under key, or 0 when array is empty."""
        dictionary[key] = array[0] if len(array) > 0 else 0

    @staticmethod
    def _first_or_zero(values):
        """Return values[0], or 0 when values is empty."""
        return values[0] if len(values) > 0 else 0

    def changeset_slice(self):
        """Write per-operation CSVs for all tracked commits and plot a slice.

        The slice is the user supplied changeset range when given, otherwise
        it runs from the first to the last tracked changeset. Exits the
        process when nothing was tracked or a requested changeset is unknown.
        """
        self._log_tracker = log.Tracker(self._args, self._archives)
        for changeset in self._changelog._all_changesets:
            if changeset in self._changelog.data:
                build_id = self._changelog.data[changeset]
                self._log_tracker.add(changeset, build_id)
        print(self._log_tracker.dump())

        tracked = self._log_tracker._data
        if not tracked:
            sys.exit("No logs were tracked for any changeset; nothing to plot.")

        # Separate the data by operation, in the row format csv.DictWriter
        # expects.
        size_trackers = {op: [] for op in self._SIZE_LOGS.values()}
        amo_tracker = []
        ping_pong_tracker = []
        prev_commit = next(iter(tracked))[0]
        amo_dict = {}
        for (commit, filename), value in tracked.items():
            if filename in self._SIZE_LOGS:
                row = {'Commit': commit[0:7]}
                for index, size in enumerate(self._MESSAGE_SIZES):
                    row['b' + str(size)] = value.latency[index]
                size_trackers[self._SIZE_LOGS[filename]].append(row)
            if filename == "ping_pong.log":
                ping_pong_tracker.append({'Commit': commit[0:7],
                                          'latency': value.latency[0]})

            # All atomics of one commit share a single row. When the commit
            # changes, flush the row collected for the previous commit.
            if commit != prev_commit:
                amo_dict['Commit'] = prev_commit[0:7]
                amo_tracker.append(amo_dict.copy())
                amo_dict.clear()
            prev_commit = commit

            if filename in self._AMO_LOGS:
                self.check_and_add_to_dict(amo_dict,
                                           self._AMO_LOGS[filename],
                                           value.latency)

        # store the last commit's amo data
        amo_dict['Commit'] = prev_commit[0:7]
        amo_tracker.append(amo_dict.copy())

        size_field_names = ['Commit'] + \
            ['b' + str(size) for size in self._MESSAGE_SIZES]
        amo_field_names = ['Commit'] + list(self._AMO_LOGS.values())
        ping_pong_field_names = ['Commit', 'latency']

        for op, rows in size_trackers.items():
            self.write_dict_to_file(rows, size_field_names, op + ".csv")
        self.write_dict_to_file(amo_tracker, amo_field_names, "amo.csv")
        self.write_dict_to_file(ping_pong_tracker, ping_pong_field_names,
                                "ping_pong.csv")

        # The R script writes its plots into ./plots.
        os.makedirs(os.path.join(os.getcwd(), 'plots'), exist_ok=True)

        keys = list(tracked.keys())
        changeset_a = keys[0][0]
        changeset_b = keys[-1][0]

        # A user supplied range is only usable when BOTH ends are tracked.
        if self._args.changeset_range:
            requested_a, requested_b = self._args.changeset_range
            known_commits = {commit for commit, _filename in keys}
            if requested_a not in known_commits or \
                    requested_b not in known_commits:
                sys.exit("One of the specified changesets was not found. Please specify correct/complete commit IDs.")
            changeset_a = requested_a
            changeset_b = requested_b

        # Argument list instead of a shell string: the changeset ids come
        # from the command line and must not be interpreted by a shell.
        r_command = ["Rscript", "./plotter.R", "-o", "./plots",
                     "-a", changeset_a[0:7], "-b", changeset_b[0:7]]
        print(" ".join(r_command))
        subprocess.check_call(r_command)

    def one_changeset_plot(self):
        """Write the CSVs for the -o changeset and run the R plotter on them.

        Exits the process when the changeset is not tracked or when one of
        the put/get logs is missing for it.
        """
        target = self._args.one_changeset
        found_changeset = False
        size_values = {}
        amo_tracker = []
        ping_pong_tracker = []
        for (commit, filename), value in self._log_tracker._data.items():
            if commit != target:
                continue
            found_changeset = True
            if filename in self._SIZE_LOGS:
                size_values[self._SIZE_LOGS[filename]] = value.latency
            elif filename in self._AMO_LOGS:
                # Exactly one row per atomic operation.
                amo_tracker.append({'op': self._AMO_LOGS[filename],
                                    'latency': self._first_or_zero(value.latency)})
            elif filename == "ping_pong.log":
                ping_pong_tracker.append(
                    {'latency': self._first_or_zero(value.latency)})

        if not found_changeset:
            sys.exit("The requested changeset was not found. Please specify correct/complete commit IDs.")

        missing = [op for op in self._SIZE_LOGS.values()
                   if op not in size_values]
        if missing:
            sys.exit("The requested changeset has no tracked log for: " +
                     ", ".join(missing))

        non_amo_field_names = ['size'] + list(self._SIZE_LOGS.values())
        non_amo_tracker = []
        for index, size in enumerate(self._MESSAGE_SIZES):
            row = {'size': size}
            for op in self._SIZE_LOGS.values():
                row[op] = size_values[op][index]
            non_amo_tracker.append(row)

        amo_field_names = ['op', 'latency']
        ping_pong_field_names = ['latency']

        self.write_dict_to_file(non_amo_tracker, non_amo_field_names,
                                "non_amo_one_changeset.csv")
        self.write_dict_to_file(amo_tracker, amo_field_names,
                                "amo_one_changeset.csv")
        self.write_dict_to_file(ping_pong_tracker, ping_pong_field_names,
                                "ping_pong_one_changeset.csv")

        # -c tells the R script to plot figures for a single changeset.
        r_command = ["Rscript", "./plotter.R", "-o", "./plots", "-c", target]
        print(" ".join(r_command))
        subprocess.check_call(r_command)
class Threshold():
    """Collects measurements that reach or exceed a configured limit."""

    def __init__(self, maximum_threshold, violation_type):
        self._violations = {}
        self._maximum_threshold = maximum_threshold
        self._violation_type = violation_type

    def check(self, value, changeset, filename):
        """Record and print value when it is at or above the threshold.

        The violation is stored under 'changeset|filename|violation_type'.
        """
        if value < self._maximum_threshold:
            return
        label = '|'.join((changeset, filename, self._violation_type))
        self._violations[label] = value
        print(label + ': ' + str(value) + '%')

    def provide_violations_to_report(self, report):
        """Write the verdict to report and terminate the process.

        With violations: records 'FAILURE' plus the dump and exits with 1.
        Without: records 'SUCCESS' and exits with 0.
        """
        if not self.has_violations():
            report.record('SUCCESS')
            sys.exit(0)
        report.record('FAILURE')
        report.record(self.dump())
        sys.exit(1)

    def has_violations(self):
        """Return True when at least one violation has been recorded."""
        return len(self._violations) > 0

    def dump(self):
        """Return the pretty-printed violations followed by a newline."""
        return pprint.pformat(self._violations, width=120) + '\n'
dir("clients/functional_tests/${build_dir}/RC_MULTI_WF_COAL") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI_WF_COAL/install' - } - } - } - - stage('RC_MULTI') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/RC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/rc_multi install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/RC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/RC_MULTI/install' - } - } - } - - stage('DC_SINGLE') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_SINGLE") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_single install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_SINGLE") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_SINGLE/install' - } - } - } - - stage('DC_MULTI') { - steps { - //===================== LIBRARY ======================= - dir("library/${build_dir}/DC_MULTI") { - sh 'mkdir -p install' - sh '../../../build_configs/dc_multi install' - } - //===================== CLIENT ======================== - dir("clients/functional_tests/${build_dir}/DC_MULTI") { - sh '../../../build_configs/release ${WORKSPACE}/library/${build_dir}/DC_MULTI/install' - } - } - } - } - } - stage('Run Tests') { - parallel { - stage('RC_SINGLE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/RC_SINGLE' - } - } - } - - stage('RC_MULTI_WF_COAL') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/RC_MULTI_WF_COAL/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI_WF_COAL' - } - } - } - - stage('RC_MULTI') { - steps { - dir("clients/functional_tests") { - sh './driver.sh 
${build_dir}/RC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/RC_MULTI' - } - } - } - - stage('DC_SINGLE') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_SINGLE/rocshmem_example_driver single_thread ${build_dir}/DC_SINGLE' - } - } - } - - stage('DC_MULTI') { - steps { - dir("clients/functional_tests") { - sh './driver.sh ${build_dir}/DC_MULTI/rocshmem_example_driver multi_thread ${build_dir}/DC_MULTI' - } - } - } - - stage('RO_NET_BASIC') { - // RO_NET controlled at runtime, no need for a new build. Use RC_MULTI - steps { - dir("clients/functional_tests") { - sh 'mkdir -p ${build_dir}/RO_NET_BASIC' - sh 'ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 UCX_TLS=rc ./driver.sh ${build_dir}/RC_MULTI/rocshmem_example_driver ro ${build_dir}/RO_NET_BASIC' - } - } - } - } - } - } -} diff --git a/projects/rocshmem/internal/continuous_integration/smoke/smoke_config.xml b/projects/rocshmem/internal/continuous_integration/smoke/smoke_config.xml deleted file mode 100644 index cdf8981d88..0000000000 --- a/projects/rocshmem/internal/continuous_integration/smoke/smoke_config.xml +++ /dev/null @@ -1,85 +0,0 @@ - - - - - false - - - - - - - - PLAIN - rsch/ec/shmem - - - PLAIN - amd-master - - - false - - - - - true - true - true - true - true - - false - false - - false - true - PLAIN - - BASE64 - PLAIN - BASE64 - - - - - - - - - amd-gerrit - - - !SMOKE - - - false - - - - - - - - - 2 - - - ssh://gerritgit/rsch/ec/shmem - - - - - FETCH_HEAD - - - false - - - - internal/continuous_integration/smoke/Jenkinsfile - false - - - false - \ No newline at end of file diff --git a/projects/rocshmem/internal/scripts/cscope-index.py b/projects/rocshmem/internal/scripts/cscope-index.py deleted file mode 100755 index 04e8698035..0000000000 --- a/projects/rocshmem/internal/scripts/cscope-index.py +++ /dev/null @@ -1,47 +0,0 @@ -#! /usr/bin/python -# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
- -import os - -suffixes = [ '.cpp', '.hpp', '.c', '.h' ] -directories = [ 'src', 'include' ] - -def oksuffix(f): - for s in suffixes: - if f.endswith(s): - return True - return False - -def try_index_dir(directory): - for dirpath,subdirs,files in os.walk(os.path.join(cwd, directory)): - okfiles = [f for f in files if oksuffix(f)] - if okfiles: - print >> file_list, \ - '\n'.join([os.path.join(dirpath, f) for f in okfiles]) - - -file_list = file('cscope.files', 'w') -cwd = os.getcwd() -for d in directories: - try_index_dir(d) -file_list.close() - -os.system("cscope -b") diff --git a/projects/rocshmem/internal/workloads/Makefile b/projects/rocshmem/internal/workloads/Makefile deleted file mode 100644 index 70dd89b582..0000000000 --- a/projects/rocshmem/internal/workloads/Makefile +++ /dev/null @@ -1,105 +0,0 @@ -HIPCC=hipcc -BUILD=./build -SRC=./src -RESULTS=./results - -#rocshmem_DIR=${HOME}/rocshmem -#MPI_HOME=${HOME}/mpich/install -NCCL_HOME=${HOME}/rccl/build - -MPI_FLAGS=-lmpi -lhsa-runtime64 -lrt -L${MPI_HOME}/lib -fgpu-rdc -SHMEM_FLAGS=${MPI_FLAGS} -lmlx5 -libverbs -RCCL_FLAGS=${MPI_FLAGS} -Wl,-rpath,$(NCCL_HOME) -L${NCCL_HOME} -lrccl - -.SILENT: run_scan extract_scan run_sort run_sort_shmem run_sort_rccl extract_sort - -all: ${BUILD}/sort_shmem ${BUILD}/sort_rccl ${BUILD}/sort_mpi - -${BUILD}/sort_shmem: ${BUILD}/sort_shmem.o ${rocshmem_DIR}/lib/librocshmem.a - ${HIPCC} $^ ${SHMEM_FLAGS} -o $@ - -${BUILD}/sort_shmem.o: ${SRC}/sort_shmem.cu - ${HIPCC} $^ -I${rocshmem_DIR}/include -I${MPI_HOME}/include -fgpu-rdc -o $@ -c - -${BUILD}/sort_rccl: ${BUILD}/sort_rccl.o - ${HIPCC} $^ ${RCCL_FLAGS} -o $@ - -${BUILD}/sort_rccl.o: ${SRC}/sort_rccl.cu - ${HIPCC} $^ -I$(NCCL_HOME)/include/rccl -I${MPI_HOME}/include -fgpu-rdc -o $@ -c - -${BUILD}/sort_mpi: ${BUILD}/sort_mpi.o - ${HIPCC} $^ ${MPI_FLAGS} -o $@ - -${BUILD}/sort_mpi.o: ${SRC}/sort_mpi.cu - ${HIPCC} $^ -I${MPI_HOME}/include -fgpu-rdc -o $@ -c - -RO_FLAGS=ROCSHMEM_RO=1 RO_NET_CPU_QUEUE=1 -ITERS?=0 1 2 3 4 5 6 7 
8 9 -TIMEOUT=1m -HOSTS=sv-pdp-0,sv-pdp-1,sv-pdp-2,sv-pdp-3 -SCAN_SIZE=1024 -PES=2 4 8 12 16 -PES_RCCL=2 4 8 - -TYPE ?= Naive -LABEL ?= naive -PARAM ?= 0 -NUM_PES ?= 2 - -run_sort_shmem: ${BUILD}/sort_shmem - printf "${TYPE} ";\ - echo "" > ${RESULTS}/sort_${LABEL}_${NUM_PES}.out; \ - for j in ${ITERS}; do \ - ${RO_FLAGS} timeout ${TIMEOUT} mpirun -np ${NUM_PES} -hosts ${HOSTS} ${BUILD}/sort_shmem ${PARAM} >> ${RESULTS}/sort_${LABEL}_${NUM_PES}.out;\ - done; - -run_sort_rccl: ${BUILD}/sort_rccl - printf "RCCL "; \ - echo "" > ${RESULTS}/sort_rccl_${NUM_PES}.out; \ - for j in ${ITERS}; do \ - timeout ${TIMEOUT} mpirun -np ${NUM_PES} -hosts ${HOSTS} ${BUILD}/sort_rccl >> ${RESULTS}/sort_rccl_${NUM_PES}.out;\ - done; - -run_sort_mpi: ${BUILD}/sort_rccl - printf "MPI2 "; \ - echo "" > ${RESULTS}/sort_mpi2_${NUM_PES}.out; \ - for j in ${ITERS}; do \ - timeout ${TIMEOUT} mpirun -np ${NUM_PES} -hosts ${HOSTS} ${BUILD}/sort_mpi >> ${RESULTS}/sort_mpi2_${NUM_PES}.out;\ - done; - -run_sort: ${BUILD}/sort_shmem ${BUILD}/sort_rccl - for i in ${PES}; do \ - printf "%d " $$i; \ - $(MAKE) --no-print-directory run_sort_shmem TYPE=NAIVE LABEL=naive PARAM=0 NUM_PES=$${i}; \ - $(MAKE) --no-print-directory run_sort_shmem TYPE=MPI LABEL=mpi PARAM=1 NUM_PES=$${i}; \ - $(MAKE) --no-print-directory run_sort_shmem TYPE=GCEN LABEL=gcen PARAM=2 NUM_PES=$${i}; \ - $(MAKE) --no-print-directory run_sort_shmem TYPE=GCEN2 LABEL=gcen2 PARAM=3 NUM_PES=$${i}; \ - $(MAKE) --no-print-directory run_sort_mpi NUM_PES=$${i}; \ - printf "\n";\ - done - for i in ${PES_RCCL}; do \ - $(MAKE) --no-print-directory run_sort_rccl NUM_PES=$${i}; \ - printf "%d " $$i; \ - done - - $(MAKE) extract_sort - - -extract_sort: - printf "Sort latency\n" - printf "PROCS\tType\tRuns" - for i in ${PES}; do \ - for type in mpi mpi2 rccl naive gcen gcen2; do\ - printf "\n%d\t$${type}\t" $$i; \ - file=${RESULTS}/sort_$${type}_$${i}.out;\ - latency=$$(grep -E "Avg time" $${file}); \ - grep -E "Avg time" $${file} | while read -r j; 
do\ - val=$$(echo $$j | grep -oE -m1 "[0-9]+\.[0-9]+");\ - printf "%s\t" $${val};\ - done; \ - done;\ - done - printf "\n" - -clean: - rm build/*; diff --git a/projects/rocshmem/internal/workloads/src/common.h b/projects/rocshmem/internal/workloads/src/common.h deleted file mode 100644 index a3f109103d..0000000000 --- a/projects/rocshmem/internal/workloads/src/common.h +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include -#include -#include -#include -#include -using namespace std; - -#define TIME_NOW std::chrono::steady_clock::now() -#define TIME_DIFF(a, b) std::chrono::duration_cast(a - b).count() - -#define HIPCHECK(cmd) do { \ - hipError_t e = cmd; \ - if( e != hipSuccess ) { \ - printf("Failed: Hip error %s:%d '%s'\n", \ - __FILE__,__LINE__,hipGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - - -#define NCCLCHECK(cmd) do { \ - ncclResult_t r = cmd; \ - if (r!= ncclSuccess) { \ - printf("Failed, NCCL error %s:%d '%s'\n", \ - __FILE__,__LINE__,ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -// Copied from rccl-tests, used to hash hostname -static uint64_t getHash(const char* string, size_t n) { - // Based on DJB2a, result = result * 33 ^ char - uint64_t result = 5381; - for (size_t c = 0; c < n; c++) { - result = ((result << 5) + result) ^ string[c]; - } - return result; -} - -/* Generate a hash of the unique identifying string for this host - * that will be unique for both bare-metal and container instances - * Equivalent of a hash of; - * - * $(hostname)$(cat /proc/sys/kernel/random/boot_id) - * - */ -#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" -static uint64_t getHostHash(const char* hostname) { - char hostHash[1024]; - - // Fall back is the hostname if something fails - (void) strncpy(hostHash, hostname, sizeof(hostHash)); - int offset = strlen(hostHash); - - FILE *file = fopen(HOSTID_FILE, "r"); - if (file != NULL) { - char *p; - if (fscanf(file, "%ms", &p) == 1) { - strncpy(hostHash+offset, p, 
sizeof(hostHash)-offset-1); - free(p); - } - } - fclose(file); - - // Make sure the string is terminated - hostHash[sizeof(hostHash)-1]='\0'; - - return getHash(hostHash, strlen(hostHash)); -} diff --git a/projects/rocshmem/internal/workloads/src/sort.h b/projects/rocshmem/internal/workloads/src/sort.h deleted file mode 100644 index 482ba7ee2d..0000000000 --- a/projects/rocshmem/internal/workloads/src/sort.h +++ /dev/null @@ -1,231 +0,0 @@ -/************************************************************************* - * * - * N A S P A R A L L E L B E N C H M A R K S 3.3 * - * * - * I S * - * * - ************************************************************************* - * * - * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. * - * It is described in NAS Technical Report 95-020. * - * * - * Permission to use, copy, distribute and modify this software * - * for any purpose with or without fee is hereby granted. We * - * request, however, that all derived work reference the NAS * - * Parallel Benchmarks 3.3. This software is provided "as is" * - * without express or implied warranty. * - * * - * Information on NPB 3.3, including the technical report, the * - * original specifications, source code, results and information * - * on how to submit new results, is available at: * - * * - * http://www.nas.nasa.gov/Software/NPB * - * * - * Send comments or suggestions to npb@nas.nasa.gov * - * Send bug reports to npb-bugs@nas.nasa.gov * - * * - * NAS Parallel Benchmarks Group * - * NASA Ames Research Center * - * Mail Stop: T27A-1 * - * Moffett Field, CA 94035-1000 * - * * - * E-mail: npb@nas.nasa.gov * - * Fax: (650) 604-3957 * - * * - ************************************************************************* - * * - * Author: M. Yarrow * - * H. 
Jin * - * * - *************************************************************************/ - -#define NUM_WGS 1 -#define WG_SIZE 1024 -#define MAX_PES 128 - -#define MAX_KEY (1 << 11) - -/* - * FUNCTION RANDLC (X, A) - * - * This routine returns a uniform pseudorandom double precision number in the - * range (0, 1) by using the linear congruential generator - * - * x_{k+1} = a x_k (mod 2^46) - * - * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers - * before repeating. The argument A is the same as 'a' in the above formula, - * and X is the same as x_0. A and X must be odd double precision integers - * in the range (1, 2^46). The returned value RANDLC is normalized to be - * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain - * the new seed x_1, so that subsequent calls to RANDLC using the same - * arguments will generate a continuous sequence. - * - * This routine should produce the same results on any computer with at least - * 48 mantissa bits in double precision floating point data. On Cray systems, - * double precision should be disabled. - * - * David H. Bailey October 26, 1990 - * - * IMPLICIT DOUBLE PRECISION (A-H, O-Z) - * SAVE KS, R23, R46, T23, T46 - * DATA KS/0/ - * - * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46, - * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than - * by merely using the ** operator, in order to insure that the results are - * exact on all systems. This code assumes that 0.5D0 is represented exactly. 
- */ - - - -/*****************************************************************/ -/************* R A N D L C ************/ -/************* ************/ -/************* portable random number generator ************/ -/*****************************************************************/ - -double randlc( double *X, double *A ) -{ - static int KS=0; - static double R23, R46, T23, T46; - double T1, T2, T3, T4; - double A1; - double A2; - double X1; - double X2; - double Z; - int i, j; - - if (KS == 0) - { - R23 = 1.0; - R46 = 1.0; - T23 = 1.0; - T46 = 1.0; - - for (i=1; i<=23; i++) - { - R23 = 0.50 * R23; - T23 = 2.0 * T23; - } - for (i=1; i<=46; i++) - { - R46 = 0.50 * R46; - T46 = 2.0 * T46; - } - KS = 1; - } - -/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */ - - T1 = R23 * *A; - j = T1; - A1 = j; - A2 = *A - T23 * A1; - -/* Break X into two parts such that X = 2^23 * X1 + X2, compute - Z = A1 * X2 + A2 * X1 (mod 2^23), and then - X = 2^23 * Z + A2 * X2 (mod 2^46). */ - - T1 = R23 * *X; - j = T1; - X1 = j; - X2 = *X - T23 * X1; - T1 = A1 * X2 + A2 * X1; - - j = R23 * T1; - T2 = j; - Z = T1 - T23 * T2; - T3 = T23 * Z + A2 * X2; - j = R46 * T3; - T4 = j; - *X = T3 - T46 * T4; - return(R46 * *X); -} - - - -/*****************************************************************/ -/************ F I N D _ M Y _ S E E D ************/ -/************ ************/ -/************ returns parallel random number seq seed ************/ -/*****************************************************************/ - -/* - * Create a random number sequence of total length nn residing - * on np number of processors. Each processor will therefore have a - * subsequence of length nn/np. This routine returns that random - * number which is the first random number for the subsequence belonging - * to processor rank kn, and which is used as seed for proc kn ran # gen. 
- */ - -double find_my_seed( int kn, /* my processor rank, 0<=kn<=num procs */ - int np, /* np = num procs */ - long nn, /* total num of ran numbers, all procs */ - double s, /* Ran num seed, for ex.: 314159265.00 */ - double a ) /* Ran num gen mult, try 1220703125.00 */ -{ - - long i; - - double t1,t2,t3,an; - long mq,nq,kk,ik; - - - - nq = nn / np; - - for( mq=0; nq>1; mq++,nq/=2 ) - ; - - t1 = a; - - for( i=1; i<=mq; i++ ) - t2 = randlc( &t1, &t1 ); - - an = t1; - - kk = kn; - t1 = s; - t2 = an; - - for( i=1; i<=100; i++ ) - { - ik = kk / 2; - if( 2 * ik != kk ) - t3 = randlc( &t1, &t2 ); - if( ik == 0 ) - break; - t3 = randlc( &t2, &t2 ); - kk = ik; - } - - return( t1 ); - -} - - - - -/*****************************************************************/ -/************* C R E A T E _ S E Q ************/ -/*****************************************************************/ - -void create_seq( double seed, double a, int *key_array, int size ) -{ - double x; - int i, k; - - k = MAX_KEY/4; - - for (i=0; i < size; i++) - { - x = randlc(&seed, &a); - x += randlc(&seed, &a); - x += randlc(&seed, &a); - x += randlc(&seed, &a); - - key_array[i] = k*x; - } -} \ No newline at end of file diff --git a/projects/rocshmem/internal/workloads/src/sort_mpi.cu b/projects/rocshmem/internal/workloads/src/sort_mpi.cu deleted file mode 100644 index 3907c6aefd..0000000000 --- a/projects/rocshmem/internal/workloads/src/sort_mpi.cu +++ /dev/null @@ -1,380 +0,0 @@ -#include "mpi.h" -#include "common.h" -#include "sort.h" - -//#define TIME_PERF -#ifdef TIME_PERF -#define TIMERS 10 -__device__ uint64_t timers[TIMERS] = {0}; -__device__ uint64_t time_start; -#define TIMERS_START() \ - if(threadIdx.x == 0) {\ - time_start = rocshmem_timer();\ - } - -#define TIME(TIMER_NUM) \ - if(threadIdx.x == 0) {\ - timers[TIMER_NUM] = rocshmem_timer() - time_start;\ - time_start = rocshmem_timer();\ - } - -#define OUTPUT_TIME() \ - if(threadIdx.x == 0 && my_pe == 0) { \ - uint64_t sum = 0; \ - for(int i = 0; 
i < TIMERS; ++i) { \ - sum += timers[i]; \ - } \ - for(int i = 0; i < TIMERS; ++i) { \ - printf("%d: %f\n", i, (double)timers[i] / (double)sum); \ - } \ - } -#else -#define TIMERS_START() -#define TIME(x) -#define OUTPUT_TIME() -#endif - -__global__ void sort1(volatile int *keys, int *keyBuffer1, - int *keyBuffer2, int *sendCount, - int *recvCount, int *sendOffset, - int *recvOffset, int *outputKeys, - size_t size, int n_pes, int my_pe) { - __shared__ int bucketCounter[MAX_PES]; - __shared__ int bucketPtr[MAX_PES]; - __shared__ int total_size; - - int buckets = n_pes; - - int tid = threadIdx.x; // + blockDim.x * blockIdx.x; - const int K_PER_BUCK = (MAX_KEY / buckets); - - // Reset - for(int i = threadIdx.x; i < buckets; i += blockDim.x) { - bucketCounter[i] = 0; - bucketPtr[i] = 0; - } - __syncthreads(); - TIMERS_START() - // Count size of each bucket - for(int i = tid; i < size; i += blockDim.x) { - atomicAdd(&bucketCounter[keys[i] / K_PER_BUCK], 1); - } - __syncthreads(); - TIME(0) - // Update in global memory - for(int i = tid; i < buckets; i += blockDim.x) { - sendCount[i] = bucketPtr[i] = bucketCounter[i]; - } - __syncthreads(); - TIME(1) - // Perform local scan to get ptrs set - for(int shift = 1; shift < buckets; shift *= 2) { - int temp = 0; - if(threadIdx.x >= shift && threadIdx.x < buckets) { - temp = bucketPtr[threadIdx.x - shift]; - } - __syncthreads(); - if(threadIdx.x < buckets) { - bucketPtr[threadIdx.x] += temp; - } - __syncthreads(); - } - __syncthreads(); - TIME(2) - // Find offsets of where we're sending - for(int i = threadIdx.x; i < buckets; i += blockDim.x) { - sendOffset[i] = bucketPtr[i] - sendCount[i]; - } - // Sort keys into buckets - for(int i = threadIdx.x; i < size; i += blockDim.x) { - int loc = atomicAdd(&bucketPtr[keys[i] / K_PER_BUCK], -1) - 1; - keyBuffer1[loc] = keys[i]; - } - TIME(3) - OUTPUT_TIME() -} - -__global__ void sort2(volatile int *keys, int *keyBuffer1, - int *keyBuffer2, int *sendCount, - int *recvCount, int 
*sendOffset, - int *recvOffset, int *outputKeys, - size_t size, int n_pes, int my_pe) { - __shared__ int total_size; - - int buckets = n_pes; - - int tid = threadIdx.x; // + blockDim.x * blockIdx.x; - const int K_PER_BUCK = (MAX_KEY / buckets); - - for(int i = threadIdx.x; i < K_PER_BUCK; i += blockDim.x) - outputKeys[i] = 0; - __syncthreads(); - TIME(5) - int min_key_val = my_pe * K_PER_BUCK; - int max_key_val = (my_pe + 1) * K_PER_BUCK - 1; - - int *key_buff_ptr = outputKeys - min_key_val; - for(int i = threadIdx.x; i < total_size; i += blockDim.x) { - atomicAdd(&key_buff_ptr[keyBuffer2[i]], 1); - } - __syncthreads(); - TIME(6) - // Perform local scan on keys - for(int shift = 1; shift < K_PER_BUCK; shift *= 2) { - int temp = 0; - if(threadIdx.x >= shift && threadIdx.x < K_PER_BUCK) { - temp = outputKeys[threadIdx.x - shift]; - } - __syncthreads(); - if(threadIdx.x < K_PER_BUCK) { - outputKeys[threadIdx.x] += temp; - } - __syncthreads(); - } - TIME(7) - OUTPUT_TIME() -} - -void sort(volatile int *keys, int *keyBuffer1, - int *keyBuffer2, int *sendCount, - int *recvCount, int *sendOffset, - int *recvOffset, int *outputKeys, - size_t size, int max_iters) { - int nProcs, my_pe; - MPI_Comm_size(MPI_COMM_WORLD, &nProcs); - MPI_Comm_rank(MPI_COMM_WORLD, &my_pe); - - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - - for(int iter = 0; iter < max_iters; ++iter) { - //fprintf(stderr, "%d: %d %d %p %p\n", my_pe, iter, max_iters, sendCount, recvCount); - sort1<<<1, WG_SIZE, 0, stream>>>(keys, keyBuffer1, - keyBuffer2, sendCount, recvCount, sendOffset, - recvOffset, outputKeys, size, nProcs, my_pe); - HIPCHECK(hipStreamSynchronize(stream)); - MPI_Alltoall(sendCount, 1, MPI_INT, recvCount, 1, - MPI_INT, MPI_COMM_WORLD); - MPI_Alltoall(sendOffset, 1, MPI_INT, recvOffset, 1, - MPI_INT, MPI_COMM_WORLD); - int total_size = 0; - MPI_Request *req = new MPI_Request[2 * nProcs]; - const int TAG = 10000; - for(int i = 0; i < nProcs; ++i) { - 
MPI_Isend(&keyBuffer1[sendOffset[i]], sendCount[i], - MPI_INT, i, TAG, MPI_COMM_WORLD, &req[2 * i]); - MPI_Irecv(&keyBuffer2[total_size], recvCount[i], - MPI_INT, i, TAG, MPI_COMM_WORLD, &req[2 * i + 1]); - total_size += recvCount[i]; - } - MPI_Waitall(2 * nProcs, req, MPI_STATUS_IGNORE); - sort2<<<1, WG_SIZE, 0, stream>>>(keys, keyBuffer1, - keyBuffer2, sendCount, recvCount, sendOffset, - recvOffset, outputKeys, size, nProcs, my_pe); - } -} - -bool verify(int *outputKeys, int *keyBuffer2, size_t size) -{ - int num_pes, my_pe; - MPI_Comm_size(MPI_COMM_WORLD, &num_pes); - MPI_Comm_rank(MPI_COMM_WORLD, &my_pe); - - MPI_Status status; - MPI_Request request; - - int min_key_val = my_pe * (MAX_KEY / num_pes); - int max_key_val = (my_pe + 1) * (MAX_KEY / num_pes) - 1; - - int *key_array = new int[size]; - // Perform final untimed sort on keys - for(int i = 0; i < size; ++i) - if(outputKeys[keyBuffer2[i] - min_key_val] > 0) - key_array[--outputKeys[keyBuffer2[i] - min_key_val]] = keyBuffer2[i]; - else { - fprintf(stderr, "%d: Found wrong key %d at %d with %d\n", my_pe, keyBuffer2[i], i, outputKeys[keyBuffer2[i]]); - return false; - } - - if(size < 1) - size = 1; - - int k; - const int MPI_TAG = 1000; - // Check if largest key is smaller than next processor's - if(my_pe > 0) - MPI_Irecv(&k, 1, MPI_INT, my_pe - 1, MPI_TAG, MPI_COMM_WORLD, - &request); - if(my_pe < num_pes - 1) - MPI_Send(&key_array[size - 1], 1, MPI_INT, my_pe + 1, MPI_TAG, - MPI_COMM_WORLD ); - if(my_pe > 0) - MPI_Wait(&request, &status); - - // Check if it is smaller - int j = 0; - if( my_pe > 0 && size > 1 ) - if( k > key_array[0] ) - j++; - - // Check if keys correctly sorted - for(int i = 1; i < size; i++) - if(key_array[i - 1] > key_array[i]) - j++; - - delete[] key_array; - - if(j != 0) { - fprintf(stderr, "Processor %d: Full_verify: number of keys out of sort: %d\n", - my_pe, j ); - return false; - } - return true; -} - -void initGPU() -{ - // Calculation for local rank, taken from rccl-tests - int 
localRank = 0; - int nProcs, proc; - MPI_Comm_size(MPI_COMM_WORLD, &nProcs); - MPI_Comm_rank(MPI_COMM_WORLD, &proc); - char hostname[1024]; - gethostname(hostname, 1024); - for (int i=0; i< 1024; i++) { - if (hostname[i] == '.') { - hostname[i] = '\0'; - break; - } - } - uint64_t hostHashs[nProcs]; - hostHashs[proc] = getHostHash(hostname); - MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); - for (int p=0; p 1) - iterations = atoi(argv[1]); - - int num_pes, my_pe; - MPI_Comm_size(MPI_COMM_WORLD, &num_pes); - MPI_Comm_rank(MPI_COMM_WORLD, &my_pe); - - // Configure input and outputs - size_t size = 1024; //atoi(argv[1]); - int *keys, *outputKeys; - hipMalloc((void**)&keys, sizeof(int) * size); - hipMalloc((void**)&outputKeys, sizeof(int) * WG_SIZE); - -/* Generate random number sequence and subsequent keys on all procs */ - create_seq( find_my_seed( my_pe, - num_pes, - 4*(long)size*num_pes, - 314159265.00, /* Random number gen seed */ - 1220703125.00 ), /* Random number gen mult */ - 1220703125.00, keys, size ); /* Random number gen mult */ - - - // Init buffers - int *keyBuffer1, *keyBuffer2; - keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size); - keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4); - - int *sendCount = 0, *recvCount = 0, *sendOffset = 0, *recvOffset = 0; - sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - - printf("Begin untimed run\n"); - // Untimed run - MPI_Barrier(MPI_COMM_WORLD); - sort((int*)keys, keyBuffer1, keyBuffer2, - sendCount, recvCount, sendOffset, recvOffset, - outputKeys, size, 1); - hipDeviceSynchronize(); - - printf("Verify untimed run\n"); - // Verify correctness - if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) { - fprintf(stderr, "Wrong output\n"); - 
return -1; - } - - printf("Begin timed run\n"); - // Timed run - MPI_Barrier(MPI_COMM_WORLD); - auto time_start = TIME_NOW; - sort((int*)keys, keyBuffer1, keyBuffer2, - sendCount, recvCount, sendOffset, recvOffset, - outputKeys, size, iterations); - hipDeviceSynchronize(); - double tot_time = (double)TIME_DIFF(TIME_NOW, time_start); - - double all_time = 0; - MPI_Allreduce(&tot_time, &all_time, 1, - MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - - if(my_pe == 0) { - printf("Avg time:\t%.3f\tus\n", all_time / (double)(1000.0 * iterations * num_pes)); - } - - // Verify correctness - if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) { - fprintf(stderr, "Wrong output\n"); - return -1; - } - - fprintf(stderr, "Done verify for %d\n", my_pe); - - // Clean up - hipFree(keys); - hipFree(outputKeys); - rocshmem_free(keyBuffer1); - rocshmem_free(keyBuffer2); - rocshmem_free(sendCount); - rocshmem_free(recvCount); - rocshmem_free(sendOffset); - rocshmem_free(recvOffset); - MPI_Finalize(); - return 0; -} diff --git a/projects/rocshmem/internal/workloads/src/sort_rccl.cu b/projects/rocshmem/internal/workloads/src/sort_rccl.cu deleted file mode 100644 index b9e0f12536..0000000000 --- a/projects/rocshmem/internal/workloads/src/sort_rccl.cu +++ /dev/null @@ -1,394 +0,0 @@ -#include "rccl.h" -#include "common.h" -#include "sort.h" - -//#define TIME_PERF -#ifdef TIME_PERF -#define TIMERS 10 -__device__ uint64_t timers[TIMERS] = {0}; -__device__ uint64_t time_start; -#define TIMERS_START() \ - if(threadIdx.x == 0) {\ - time_start = rocshmem_timer();\ - } - -#define TIME(TIMER_NUM) \ - if(threadIdx.x == 0) {\ - timers[TIMER_NUM] = rocshmem_timer() - time_start;\ - time_start = rocshmem_timer();\ - } - -#define OUTPUT_TIME() \ - if(threadIdx.x == 0 && my_pe == 0) { \ - uint64_t sum = 0; \ - for(int i = 0; i < TIMERS; ++i) { \ - sum += timers[i]; \ - } \ - for(int i = 0; i < TIMERS; ++i) { \ - printf("%d: %f\n", i, (double)timers[i] / (double)sum); \ - } \ - } -#else 
-#define TIMERS_START() -#define TIME(x) -#define OUTPUT_TIME() -#endif - -__global__ void sort1(volatile int *keys, int *keyBuffer1, - int *keyBuffer2, int *sendCount, - int *recvCount, int *sendOffset, - int *recvOffset, int *outputKeys, - size_t size, int n_pes, int my_pe) { - __shared__ int bucketCounter[MAX_PES]; - __shared__ int bucketPtr[MAX_PES]; - __shared__ int total_size; - - int buckets = n_pes; - - int tid = threadIdx.x; // + blockDim.x * blockIdx.x; - const int K_PER_BUCK = (MAX_KEY / buckets); - - // Reset - for(int i = threadIdx.x; i < buckets; i += blockDim.x) { - bucketCounter[i] = 0; - bucketPtr[i] = 0; - } - __syncthreads(); - TIMERS_START() - // Count size of each bucket - for(int i = tid; i < size; i += blockDim.x) { - atomicAdd(&bucketCounter[keys[i] / K_PER_BUCK], 1); - } - __syncthreads(); - TIME(0) - // Update in global memory - for(int i = tid; i < buckets; i += blockDim.x) { - sendCount[i] = bucketPtr[i] = bucketCounter[i]; - } - __syncthreads(); - TIME(1) - // Perform local scan to get ptrs set - for(int shift = 1; shift < buckets; shift *= 2) { - int temp = 0; - if(threadIdx.x >= shift && threadIdx.x < buckets) { - temp = bucketPtr[threadIdx.x - shift]; - } - __syncthreads(); - if(threadIdx.x < buckets) { - bucketPtr[threadIdx.x] += temp; - } - __syncthreads(); - } - __syncthreads(); - TIME(2) - // Find offsets of where we're sending - for(int i = threadIdx.x; i < buckets; i += blockDim.x) { - sendOffset[i] = bucketPtr[i] - sendCount[i]; - } - // Sort keys into buckets - for(int i = threadIdx.x; i < size; i += blockDim.x) { - int loc = atomicAdd(&bucketPtr[keys[i] / K_PER_BUCK], -1) - 1; - keyBuffer1[loc] = keys[i]; - } - TIME(3) - OUTPUT_TIME() -} - -__global__ void sort2(volatile int *keys, int *keyBuffer1, - int *keyBuffer2, int *sendCount, - int *recvCount, int *sendOffset, - int *recvOffset, int *outputKeys, - size_t size, int n_pes, int my_pe) { - __shared__ int total_size; - - int buckets = n_pes; - - int tid = threadIdx.x; // + 
blockDim.x * blockIdx.x; - const int K_PER_BUCK = (MAX_KEY / buckets); - - for(int i = threadIdx.x; i < K_PER_BUCK; i += blockDim.x) - outputKeys[i] = 0; - __syncthreads(); - TIME(5) - int min_key_val = my_pe * K_PER_BUCK; - int max_key_val = (my_pe + 1) * K_PER_BUCK - 1; - - int *key_buff_ptr = outputKeys - min_key_val; - for(int i = threadIdx.x; i < total_size; i += blockDim.x) { - atomicAdd(&key_buff_ptr[keyBuffer2[i]], 1); - } - __syncthreads(); - TIME(6) - // Perform local scan on keys - for(int shift = 1; shift < K_PER_BUCK; shift *= 2) { - int temp = 0; - if(threadIdx.x >= shift && threadIdx.x < K_PER_BUCK) { - temp = outputKeys[threadIdx.x - shift]; - } - __syncthreads(); - if(threadIdx.x < K_PER_BUCK) { - outputKeys[threadIdx.x] += temp; - } - __syncthreads(); - } - TIME(7) - OUTPUT_TIME() -} - -void sort(volatile int *keys, int *keyBuffer1, - int *keyBuffer2, int *sendCount, - int *recvCount, int *sendOffset, - int *recvOffset, int *outputKeys, - size_t size, int max_iters, ncclComm_t comm) { - int nProcs, my_pe; - MPI_Comm_size(MPI_COMM_WORLD, &nProcs); - MPI_Comm_rank(MPI_COMM_WORLD, &my_pe); - - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - - for(int iter = 0; iter < max_iters; ++iter) { - //fprintf(stderr, "%d: %d %d %p %p\n", my_pe, iter, max_iters, sendCount, recvCount); - sort1<<<1, WG_SIZE, 0, stream>>>(keys, keyBuffer1, - keyBuffer2, sendCount, recvCount, sendOffset, - recvOffset, outputKeys, size, nProcs, my_pe); - NCCLCHECK(ncclAllToAll(sendCount, recvCount, 1, - ncclInt, comm, stream)); - NCCLCHECK(ncclAllToAll(sendOffset, recvOffset, 1, - ncclInt, comm, stream)); - HIPCHECK(hipStreamSynchronize(stream)); - NCCLCHECK(ncclGroupStart()); - int total_size = 0; - for(int i = 0; i < nProcs; ++i) { - ncclSend(&keyBuffer1[sendOffset[i]], sendCount[i], - ncclInt, i, comm, stream); - ncclRecv(&keyBuffer2[total_size], recvCount[i], - ncclInt, i, comm, stream); - total_size += recvCount[i]; - } - NCCLCHECK(ncclGroupEnd()); - 
HIPCHECK(hipStreamSynchronize(stream)); - sort2<<<1, WG_SIZE, 0, stream>>>(keys, keyBuffer1, - keyBuffer2, sendCount, recvCount, sendOffset, - recvOffset, outputKeys, size, nProcs, my_pe); - HIPCHECK(hipStreamSynchronize(stream)); - } -} - -bool verify(int *outputKeys, int *keyBuffer2, size_t size) -{ - int num_pes, my_pe; - MPI_Comm_size(MPI_COMM_WORLD, &num_pes); - MPI_Comm_rank(MPI_COMM_WORLD, &my_pe); - - MPI_Status status; - MPI_Request request; - - int min_key_val = my_pe * (MAX_KEY / num_pes); - int max_key_val = (my_pe + 1) * (MAX_KEY / num_pes) - 1; - - int *key_array = new int[size]; - // Perform final untimed sort on keys - for(int i = 0; i < size; ++i) - if(outputKeys[keyBuffer2[i] - min_key_val] > 0) - key_array[--outputKeys[keyBuffer2[i] - min_key_val]] = keyBuffer2[i]; - else { - fprintf(stderr, "%d: Found wrong key %d at %d with %d\n", my_pe, keyBuffer2[i], i, outputKeys[keyBuffer2[i]]); - return false; - } - - if(size < 1) - size = 1; - - int k; - const int MPI_TAG = 1000; - // Check if largest key is smaller than next processor's - if(my_pe > 0) - MPI_Irecv(&k, 1, MPI_INT, my_pe - 1, MPI_TAG, MPI_COMM_WORLD, - &request); - if(my_pe < num_pes - 1) - MPI_Send(&key_array[size - 1], 1, MPI_INT, my_pe + 1, MPI_TAG, - MPI_COMM_WORLD ); - if(my_pe > 0) - MPI_Wait(&request, &status); - - // Check if it is smaller - int j = 0; - if( my_pe > 0 && size > 1 ) - if( k > key_array[0] ) - j++; - - // Check if keys correctly sorted - for(int i = 1; i < size; i++) - if(key_array[i - 1] > key_array[i]) - j++; - - delete[] key_array; - - if(j != 0) { - fprintf(stderr, "Processor %d: Full_verify: number of keys out of sort: %d\n", - my_pe, j ); - return false; - } - return true; -} - -void initGPU(ncclComm_t &comms) -{ - // Calculation for local rank, taken from rccl-tests - int localRank = 0; - int nProcs, proc; - MPI_Comm_size(MPI_COMM_WORLD, &nProcs); - MPI_Comm_rank(MPI_COMM_WORLD, &proc); - char hostname[1024]; - gethostname(hostname, 1024); - for (int i=0; i< 
1024; i++) { - if (hostname[i] == '.') { - hostname[i] = '\0'; - break; - } - } - uint64_t hostHashs[nProcs]; - hostHashs[proc] = getHostHash(hostname); - MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); - for (int p=0; p 1) - iterations = atoi(argv[1]); - - int num_pes, my_pe; - MPI_Comm_size(MPI_COMM_WORLD, &num_pes); - MPI_Comm_rank(MPI_COMM_WORLD, &my_pe); - - // Configure input and outputs - size_t size = 1024; //atoi(argv[1]); - int *keys, *outputKeys; - hipMalloc((void**)&keys, sizeof(int) * size); - hipMalloc((void**)&outputKeys, sizeof(int) * WG_SIZE); - -/* Generate random number sequence and subsequent keys on all procs */ - create_seq( find_my_seed( my_pe, - num_pes, - 4*(long)size*num_pes, - 314159265.00, /* Random number gen seed */ - 1220703125.00 ), /* Random number gen mult */ - 1220703125.00, keys, size ); /* Random number gen mult */ - - - // Init buffers - int *keyBuffer1, *keyBuffer2; - keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size); - keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4); - - int *sendCount = 0, *recvCount = 0, *sendOffset = 0, *recvOffset = 0; - sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - - printf("Begin untimed run\n"); - // Untimed run - MPI_Barrier(MPI_COMM_WORLD); - sort((int*)keys, keyBuffer1, keyBuffer2, - sendCount, recvCount, sendOffset, recvOffset, - outputKeys, size, 1, comms); - hipDeviceSynchronize(); - - printf("Verify untimed run\n"); - // Verify correctness - if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) { - fprintf(stderr, "Wrong output\n"); - return -1; - } - - printf("Begin timed run\n"); - // Timed run - MPI_Barrier(MPI_COMM_WORLD); - auto time_start = TIME_NOW; - sort((int*)keys, keyBuffer1, keyBuffer2, - sendCount, 
recvCount, sendOffset, recvOffset, - outputKeys, size, iterations, comms); - hipDeviceSynchronize(); - double tot_time = (double)TIME_DIFF(TIME_NOW, time_start); - - double all_time = 0; - MPI_Allreduce(&tot_time, &all_time, 1, - MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - - if(my_pe == 0) { - printf("Avg time:\t%.3f\tus\n", all_time / (double)(1000.0 * iterations * num_pes)); - } - - // Verify correctness - if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) { - fprintf(stderr, "Wrong output\n"); - return -1; - } - - // Clean up - hipFree(keys); - hipFree(outputKeys); - rocshmem_free(keyBuffer1); - rocshmem_free(keyBuffer2); - rocshmem_free(sendCount); - rocshmem_free(recvCount); - rocshmem_free(sendOffset); - rocshmem_free(recvOffset); - ncclCommDestroy(comms); - MPI_Finalize(); - return 0; -} diff --git a/projects/rocshmem/internal/workloads/src/sort_shmem.cu b/projects/rocshmem/internal/workloads/src/sort_shmem.cu deleted file mode 100644 index a6f88c6ab4..0000000000 --- a/projects/rocshmem/internal/workloads/src/sort_shmem.cu +++ /dev/null @@ -1,358 +0,0 @@ -#include -#include -#include -#include -#include -using namespace std; -using namespace rocshmem; - -#include "common.h" -#include "sort.h" - -//#define TIME_PERF -#ifdef TIME_PERF -#define TIMERS 10 -__device__ uint64_t timers[TIMERS] = {0}; -__device__ uint64_t time_start; -#define TIMERS_START() \ - if(threadIdx.x == 0) {\ - time_start = rocshmem_timer();\ - } - -#define TIME(TIMER_NUM) \ - if(threadIdx.x == 0) {\ - timers[TIMER_NUM] = rocshmem_timer() - time_start;\ - time_start = rocshmem_timer();\ - } - -#define OUTPUT_TIME() \ - if(threadIdx.x == 0 && my_pe == 0) { \ - uint64_t sum = 0; \ - for(int i = 0; i < TIMERS; ++i) { \ - sum += timers[i]; \ - } \ - for(int i = 0; i < TIMERS; ++i) { \ - printf("%d: %f\n", i, (double)timers[i] / (double)sum); \ - } \ - } -#else -#define TIMERS_START() -#define TIME(x) -#define OUTPUT_TIME() -#endif - -__device__ __inline__ void 
alltoall(rocshmem_ctx_t &ctx, - rocshmem_team_t team, - int *dst, int *src) { - // Perform alltoall - rocshmem_ctx_int_wg_alltoall(ctx, - team, - dst, // T* dest - src, // const T* source - 1); // int nelement -} - -__global__ void sort(volatile int *keys, int *keyBuffer1, - int *keyBuffer2, int *sendCount, - int *recvCount, int *sendOffset, - int *recvOffset, int *outputKeys, - size_t size, rocshmem_team_t team, - int max_iters) { - __shared__ rocshmem_ctx_t ctx; - __shared__ int bucketCounter[MAX_PES]; - __shared__ int bucketPtr[MAX_PES]; - __shared__ int total_size; - - rocshmem_wg_init(); - rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx); - - int n_pes = rocshmem_ctx_n_pes(ctx); - int my_pe = rocshmem_my_pe(); - int buckets = n_pes; - - int tid = threadIdx.x; // + blockDim.x * blockIdx.x; - const int K_PER_BUCK = (MAX_KEY / buckets); - - for(int iter = 0; iter < max_iters; ++iter) { - // Reset - for(int i = threadIdx.x; i < buckets; i += blockDim.x) { - bucketCounter[i] = 0; - bucketPtr[i] = 0; - } - __syncthreads(); - TIMERS_START() - // Count size of each bucket - for(int i = tid; i < size; i += blockDim.x) { - atomicAdd(&bucketCounter[keys[i] / K_PER_BUCK], 1); - } - __syncthreads(); - TIME(0) - // Update in global memory - for(int i = tid; i < buckets; i += blockDim.x) { - sendCount[i] = bucketPtr[i] = bucketCounter[i]; - } - __syncthreads(); - TIME(1) - // Perform local scan to get ptrs set - for(int shift = 1; shift < buckets; shift *= 2) { - int temp = 0; - if(threadIdx.x >= shift && threadIdx.x < buckets) { - temp = bucketPtr[threadIdx.x - shift]; - } - __syncthreads(); - if(threadIdx.x < buckets) { - bucketPtr[threadIdx.x] += temp; - } - __syncthreads(); - } - __syncthreads(); - TIME(2) - // Find offsets of where we're sending - for(int i = threadIdx.x; i < buckets; i += blockDim.x) { - sendOffset[i] = bucketPtr[i] - sendCount[i]; - } - // Sort keys into buckets - for(int i = threadIdx.x; i < size; i += blockDim.x) { - int loc = 
atomicAdd(&bucketPtr[keys[i] / K_PER_BUCK], -1) - 1; - keyBuffer1[loc] = keys[i]; - } - rocshmem_ctx_threadfence_system(ctx); - // Force sync to wait for all PEs to update bucket sizes - rocshmem_ctx_wg_team_sync(ctx, team); - TIME(3) - // Let all PEs know how many keys you wish to send - alltoall(ctx, team, recvCount, sendCount); - // Let all PEs know where the offsets are of the keys - alltoall(ctx, team, recvOffset, sendOffset); - __syncthreads(); - TIME(4) - if(threadIdx.x == 0) { - total_size = 0; - for(int i = 0; i < buckets; ++i) { - rocshmem_int_get_nbi(&keyBuffer2[total_size], - &keyBuffer1[recvOffset[i]], recvCount[i], i); - total_size += recvCount[i]; - } - rocshmem_quiet(); - } - for(int i = threadIdx.x; i < K_PER_BUCK; i += blockDim.x) - outputKeys[i] = 0; - __syncthreads(); - TIME(5) - int min_key_val = my_pe * K_PER_BUCK; - int max_key_val = (my_pe + 1) * K_PER_BUCK - 1; - - int *key_buff_ptr = outputKeys - min_key_val; - for(int i = threadIdx.x; i < total_size; i += blockDim.x) { - atomicAdd(&key_buff_ptr[keyBuffer2[i]], 1); - } - __syncthreads(); - TIME(6) - // Perform local scan on keys - for(int shift = 1; shift < K_PER_BUCK; shift *= 2) { - int temp = 0; - if(threadIdx.x >= shift && threadIdx.x < K_PER_BUCK) { - temp = outputKeys[threadIdx.x - shift]; - } - __syncthreads(); - if(threadIdx.x < K_PER_BUCK) { - outputKeys[threadIdx.x] += temp; - } - __syncthreads(); - } - TIME(7) - } - OUTPUT_TIME() - rocshmem_wg_ctx_destroy(ctx); - rocshmem_wg_finalize(); -} - -bool verify(int *outputKeys, int *keyBuffer2, size_t size) -{ - int num_pes = rocshmem_n_pes(); - int my_pe = rocshmem_my_pe(); - - MPI_Status status; - MPI_Request request; - - int min_key_val = my_pe * (MAX_KEY / num_pes); - int max_key_val = (my_pe + 1) * (MAX_KEY / num_pes) - 1; - - int *key_array = new int[size]; - // Perform final untimed sort on keys - for(int i = 0; i < size; ++i) - if(outputKeys[keyBuffer2[i] - min_key_val] > 0) - key_array[--outputKeys[keyBuffer2[i] - 
min_key_val]] = keyBuffer2[i]; - else { - fprintf(stderr, "%d: Found wrong key %d at %d with %d\n", my_pe, keyBuffer2[i], i, outputKeys[keyBuffer2[i]]); - return false; - } - - if(size < 1) - size = 1; - - int k; - const int MPI_TAG = 1000; - // Check if largest key is smaller than next processor's - if(my_pe > 0) - MPI_Irecv(&k, 1, MPI_INT, my_pe - 1, MPI_TAG, MPI_COMM_WORLD, - &request); - if(my_pe < num_pes - 1) - MPI_Send(&key_array[size - 1], 1, MPI_INT, my_pe + 1, MPI_TAG, - MPI_COMM_WORLD ); - if(my_pe > 0) - MPI_Wait(&request, &status); - - // Check if it is smaller - int j = 0; - if( my_pe > 0 && size > 1 ) - if( k > key_array[0] ) - j++; - - // Check if keys correctly sorted - for(int i = 1; i < size; i++) - if(key_array[i - 1] > key_array[i]) - j++; - - delete[] key_array; - - if(j != 0) { - fprintf(stderr, "Processor %d: Full_verify: number of keys out of sort: %d\n", - my_pe, j ); - return false; - } - return true; -} - -void initGPU() -{ - // Calculation for local rank, taken from rccl-tests - int localRank = 0; - int proc = rocshmem_my_pe(); - int nProcs = rocshmem_n_pes(); - char hostname[1024]; - gethostname(hostname, 1024); - for (int i=0; i< 1024; i++) { - if (hostname[i] == '.') { - hostname[i] = '\0'; - break; - } - } - uint64_t hostHashs[nProcs]; - hostHashs[proc] = getHostHash(hostname); - MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); - for (int p=0; p 1) - iterations = atoi(argv[1]); - - int num_pes = rocshmem_n_pes(); - int my_pe = rocshmem_my_pe(); - - // Configure input and outputs - size_t size = 1024; //atoi(argv[2]); - int *keys, *outputKeys; - hipMalloc((void**)&keys, sizeof(int) * size); - hipMalloc((void**)&outputKeys, sizeof(int) * WG_SIZE); - -/* Generate random number sequence and subsequent keys on all procs */ - create_seq( find_my_seed( my_pe, - num_pes, - 4*(long)size*num_pes, - 314159265.00, /* Random number gen seed */ - 1220703125.00 ), /* Random number gen mult 
*/ - 1220703125.00, keys, size ); /* Random number gen mult */ - - - // Init buffers - int *keyBuffer1, *keyBuffer2; - keyBuffer1 = (int*)rocshmem_malloc(sizeof(int) * size); - keyBuffer2 = (int*)rocshmem_malloc(sizeof(int) * size * 4); - - int *sendCount, *recvCount, *sendOffset, *recvOffset; - sendCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - recvCount = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - sendOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - recvOffset = (int*)rocshmem_malloc(sizeof(int) * MAX_PES); - - // Untimed run - rocshmem_barrier_all(); - sort<<<1, WG_SIZE>>>((int*)keys, keyBuffer1, keyBuffer2, - sendCount, recvCount, sendOffset, recvOffset, - outputKeys, size, team_world_dup, 1); - hipDeviceSynchronize(); - - // Verify correctness - if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) { - fprintf(stderr, "Wrong output\n"); - return -1; - } - - // Timed run - rocshmem_barrier_all(); - auto time_start = TIME_NOW; - sort<<<1, WG_SIZE>>>((int*)keys, keyBuffer1, keyBuffer2, - sendCount, recvCount, sendOffset, recvOffset, - outputKeys, size, team_world_dup, iterations); - hipDeviceSynchronize(); - double tot_time = (double)TIME_DIFF(TIME_NOW, time_start); - - double all_time = 0; - MPI_Allreduce(&tot_time, &all_time, 1, - MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - - if(my_pe == 0) { - printf("Avg time:\t%f\tus\n", all_time / - (double)(1000.0 * iterations * num_pes)); - } - - // Verify correctness - if(!verify(outputKeys, keyBuffer2, outputKeys[MAX_KEY / num_pes - 1])) { - fprintf(stderr, "Wrong output\n"); - return -1; - } - - // Clean up - hipFree(keys); - hipFree(outputKeys); - rocshmem_free(keyBuffer1); - rocshmem_free(keyBuffer2); - rocshmem_free(sendCount); - rocshmem_free(recvCount); - rocshmem_free(sendOffset); - rocshmem_free(recvOffset); - rocshmem_finalize(); - return 0; -}