Unify environment variable management (#235)

* Add environment variable configuration infrastructure
  - Namespace rocshmem::envvar
  - Track all config env vars in per-category lists
  - Remove duplicates from list of allowed env var types
  - Reject negative inputs for unsigned integer types
  - Accept empty strings for std::string
  - Print error source location using C++20 std::source_location
  - Unit tests
* Port environment variables
  - ROCSHMEM_UNIQUEID_WITH_MPI
  - ROCSHMEM_RO_DISABLE_IPC
  - ROCSHMEM_BOOTSTRAP_TIMEOUT
  - ROCSHMEM_BOOTSTRAP_HOSTID
  - ROCSHMEM_BOOTSTRAP_SOCKET_IFNAME
  - ROCSHMEM_RO_PROGRESS_DELAY
  - ROCSHMEM_BOOTSTRAP_SOCKET_FAMILY
  - ROCSHMEM_MAX_NUM_CONTEXTS
    + Merge the independent per-backend copies into a single variable
      that is used by all three backends (IPC, RO, GDA).
    + Set default to 32 (for GDA); prior default for IPC and RO was 1024.
  - ROCSHMEM_MAX_NUM_HOST_CONTEXTS
  - ROCSHMEM_MAX_WF_BUFFERS
  - ROCSHMEM_SQ_SIZE
  - ROCSHMEM_RO_NET_CPU_QUEUE
    + Renamed from RO_NET_CPU_QUEUE
    + Change env var input type to bool, default to false
    + Invert code logic: previously, setting RO_NET_CPU_QUEUE to any value
      would /disable/ the variable gpu_queue, which defaulted to true.
      The variable is now named config::ro::net_cpu_queue,
      with all prior checks for gpu_queue inverted.
  - ROCSHMEM_USE_IB_HCA
  - ROCSHMEM_HEAP_SIZE
    + Defaults to 1L << 30 i.e. 1 GiB,
      from default heap size in memory/heap_memory.hpp.
  - ROCSHMEM_MAX_NUM_TEAMS
    + Unlike other env vars, this can be referenced from devices.
    + Function currently narrows from size_t to int: uses need to be audited
      for safety and correctness in using size_t directly.
  - ROCSHMEM_GDA_ALTERNATE_QP_PORTS
* New env var ROCSHMEM_DEBUG
  - Debug levels:
    + NONE
    + VERSION
    + WARN
    + INFO
    + TRACE
  - Currently unused - will be added later
  - Mirrors RCCL debug control
* Remove rocshmem::rocshmem_env_config
* Change interface for GetClosestNicToGpu
  to accept const char** instead of char**:
  the pointed-to string does not need to be modified
  - Files that include util.hpp solely for env vars were not audited
---------
Signed-off-by: Omri Mor <Omri.Mor@amd.com>
Этот коммит содержится в:
Omri Mor
2025-10-06 10:05:57 -07:00
коммит произвёл GitHub
родитель 0a4f8a83b9
Коммит a0fcbf8d35
29 изменённых файлов: 1070 добавлений и 277 удалений
-75
Просмотреть файл
@@ -34,8 +34,6 @@ namespace rocshmem {
__constant__ int* print_lock;
rocshmem_env_config rocshmem_env_;
typedef struct device_agent {
hsa_agent_t agent;
hsa_amd_memory_pool_t pool;
@@ -123,77 +121,4 @@ void rocm_memory_lock_to_fine_grain(void* ptr, size_t size, void** gpu_ptr,
}
}
// Populate the configuration from the process environment.
//
// A member is overwritten only when its variable is present in the
// environment; otherwise it keeps whatever value it already held.
// Integer settings are converted with atoi(); string settings are copied
// verbatim into a std::string.
rocshmem_env_config::rocshmem_env_config() {
  // Assign atoi(value) to `field` when `name` is set in the environment.
  const auto load_int = [](const char* name, auto& field) {
    if (const char* raw = getenv(name)) {
      field = atoi(raw);
    }
  };

  // Assign a copy of the value to `field` when `name` is set in the
  // environment.
  const auto load_str = [](const char* name, auto& field) {
    if (const char* raw = getenv(name)) {
      field = std::string(raw);
    }
  };

  load_int("ROCSHMEM_DISABLE_IPC", disable_ipc);
  // For backward compatibility, synonymous with ROCSHMEM_DISABLE_IPC.
  // It is read second, so it wins when both variables are set.
  load_int("ROCSHMEM_RO_DISABLE_IPC", disable_ipc);

  load_int("ROCSHMEM_RO_PROGRESS_DELAY", ro_progress_delay);
  load_int("ROCSHMEM_UNIQUEID_WITH_MPI", uniqueid_with_mpi);
  load_int("ROCSHMEM_BOOTSTRAP_TIMEOUT", bootstrap_timeout);

  load_str("ROCSHMEM_BOOTSTRAP_HOSTID", bootstrap_hostid);
  load_str("ROCSHMEM_BOOTSTRAP_SOCKET_FAMILY", bootstrap_socket_family);
  load_str("ROCSHMEM_BOOTSTRAP_SOCKET_IFNAME", bootstrap_socket_ifname);
}
int rocshmem_env_config::get_disable_ipc() {
return disable_ipc;
}
int rocshmem_env_config::get_ro_progress_delay() {
return ro_progress_delay;
}
int rocshmem_env_config::get_uniqueid_with_mpi() {
return uniqueid_with_mpi;
}
int rocshmem_env_config::get_bootstrap_timeout() {
return bootstrap_timeout;
}
std::string rocshmem_env_config::get_bootstrap_hostid() {
return bootstrap_hostid;
}
std::string rocshmem_env_config::get_bootstrap_socket_family() {
return bootstrap_socket_family;
}
std::string rocshmem_env_config::get_bootstrap_socket_ifname() {
return bootstrap_socket_ifname;
}
} // namespace rocshmem