Unify environment variable management (#235)
* Add environment variable configuration infrastructure
- Namespace rocshmem::envvar
- Track all config env vars in per-category lists
- Remove duplicates from list of allowed env var types
- Reject negative inputs for unsigned integer types
- Accept empty strings for std::string
- Print error source location using C++20 std::source_location
- Unit tests
* Port environment variables
- ROCSHMEM_UNIQUEID_WITH_MPI
- ROCSHMEM_RO_DISABLE_IPC
- ROCSHMEM_BOOTSTRAP_TIMEOUT
- ROCSHMEM_BOOTSTRAP_HOSTID
- ROCSHMEM_BOOTSTRAP_SOCKET_IFNAME
- ROCSHMEM_RO_PROGRESS_DELAY
- ROCSHMEM_BOOTSTRAP_SOCKET_FAMILY
- ROCSHMEM_MAX_NUM_CONTEXTS
+ Merge the independent per-backend copies into a single variable
that is used by all three backends (IPC, RO, GDA).
+ Set default to 32 (for GDA); prior default for IPC and RO was 1024.
- ROCSHMEM_MAX_NUM_HOST_CONTEXTS
- ROCSHMEM_MAX_WF_BUFFERS
- ROCSHMEM_SQ_SIZE
- ROCSHMEM_RO_NET_CPU_QUEUE
+ Renamed from RO_NET_CPU_QUEUE
+ Change env var input type to bool, default to false
+ Invert code logic: previously, setting RO_NET_CPU_QUEUE to anything
would /disable/ a variable gpu_queue, which defaulted to true.
The variable is now named config::ro::net_cpu_queue,
with all prior checks for gpu_queue inverted.
- ROCSHMEM_USE_IB_HCA
- ROCSHMEM_HEAP_SIZE
+ Defaults to 1L << 30 i.e. 1 GiB,
from default heap size in memory/heap_memory.hpp.
- ROCSHMEM_MAX_NUM_TEAMS
+ Unlike other env vars, this can be referenced from devices.
+ Function currently narrows from size_t to int: uses need to be audited
for safety and correctness in using size_t directly.
- ROCSHMEM_GDA_ALTERNATE_QP_PORTS
* New env var ROCSHMEM_DEBUG
- Debug levels:
+ NONE
+ VERSION
+ WARN
+ INFO
+ TRACE
- Currently unused - will be added later
- Mirrors RCCL debug control
* Remove rocshmem::rocshmem_env_config
* Change interface for GetClosestNicToGpu
to accept const char** instead of char**:
the pointed-to string does not need to be modified
- Files were not audited for cases where util.hpp is included only for env vars
---------
Signed-off-by: Omri Mor <Omri.Mor@amd.com>
Этот коммит содержится в:
@@ -34,8 +34,6 @@ namespace rocshmem {
|
||||
|
||||
// Pointer to int declared with the __constant__ qualifier.
// NOTE(review): presumably a lock used to serialize printing — confirm against its users.
__constant__ int* print_lock;
|
||||
|
||||
// Namespace-scope rocshmem_env_config object; the constructor defined later in
// this file fills it from ROCSHMEM_* environment variables.
// NOTE(review): presumably the shared instance queried via the get_*() accessors — confirm.
rocshmem_env_config rocshmem_env_;
|
||||
|
||||
typedef struct device_agent {
|
||||
hsa_agent_t agent;
|
||||
hsa_amd_memory_pool_t pool;
|
||||
@@ -123,77 +121,4 @@ void rocm_memory_lock_to_fine_grain(void* ptr, size_t size, void** gpu_ptr,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Populate the configuration from the process environment.
 *
 * Every recognised ROCSHMEM_* variable overrides its member only when the
 * variable is present in the environment; an absent variable leaves the
 * member untouched. Numeric settings are converted with atoi(), string
 * settings are copied verbatim.
 */
rocshmem_env_config::rocshmem_env_config() {
  // --- numeric settings -----------------------------------------------------
  if (const char* ipc_flag = getenv("ROCSHMEM_DISABLE_IPC")) {
    disable_ipc = atoi(ipc_flag);
  }

  // For backward compatibility, synonymous with ROCSHMEM_DISABLE_IPC.
  // It is evaluated second, so it takes precedence when both are set.
  if (const char* ro_ipc_flag = getenv("ROCSHMEM_RO_DISABLE_IPC")) {
    disable_ipc = atoi(ro_ipc_flag);
  }

  if (const char* delay = getenv("ROCSHMEM_RO_PROGRESS_DELAY")) {
    ro_progress_delay = atoi(delay);
  }

  if (const char* with_mpi = getenv("ROCSHMEM_UNIQUEID_WITH_MPI")) {
    uniqueid_with_mpi = atoi(with_mpi);
  }

  if (const char* timeout = getenv("ROCSHMEM_BOOTSTRAP_TIMEOUT")) {
    bootstrap_timeout = atoi(timeout);
  }

  // --- string settings ------------------------------------------------------
  if (const char* hostid = getenv("ROCSHMEM_BOOTSTRAP_HOSTID")) {
    bootstrap_hostid = std::string(hostid);
  }

  if (const char* family = getenv("ROCSHMEM_BOOTSTRAP_SOCKET_FAMILY")) {
    bootstrap_socket_family = std::string(family);
  }

  if (const char* ifname = getenv("ROCSHMEM_BOOTSTRAP_SOCKET_IFNAME")) {
    bootstrap_socket_ifname = std::string(ifname);
  }
}
|
||||
|
||||
int rocshmem_env_config::get_disable_ipc() {
|
||||
return disable_ipc;
|
||||
}
|
||||
|
||||
int rocshmem_env_config::get_ro_progress_delay() {
|
||||
return ro_progress_delay;
|
||||
}
|
||||
|
||||
int rocshmem_env_config::get_uniqueid_with_mpi() {
|
||||
return uniqueid_with_mpi;
|
||||
}
|
||||
|
||||
int rocshmem_env_config::get_bootstrap_timeout() {
|
||||
return bootstrap_timeout;
|
||||
}
|
||||
|
||||
std::string rocshmem_env_config::get_bootstrap_hostid() {
|
||||
return bootstrap_hostid;
|
||||
}
|
||||
|
||||
std::string rocshmem_env_config::get_bootstrap_socket_family() {
|
||||
return bootstrap_socket_family;
|
||||
}
|
||||
|
||||
std::string rocshmem_env_config::get_bootstrap_socket_ifname() {
|
||||
return bootstrap_socket_ifname;
|
||||
}
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
Ссылка в новой задаче
Block a user