diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md
index 6b2d2e26ad..2e6c9fa02e 100644
--- a/projects/rccl/CHANGELOG.md
+++ b/projects/rccl/CHANGELOG.md
@@ -8,16 +8,16 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 
 * Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call.
 * Fixed unit test failures in tests ending with `ManagedMem` and `ManagedMemGraph` suffixes.
-* Suboptimal algorithmic switching point for AllReduce on MI300x
+* Fixed the suboptimal algorithmic switching point for AllReduce on MI300X.
 * Fixed the known issue "When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault." with a design change to use `comm` instead of `rank` for `mscclStatus`. The Global map for `comm` to `mscclStatus` is still not thread safe but should be explicitly handled by mutexes for read writes. This is tested for correctness, but there is a plan to use a thread-safe map data structure in upcoming changes.
 
 ### Added
 
 * Added new GPU target `gfx950`.
-* Added support for `unroll=1` in device-code generation to improve performance
-* Set a default of 112 channels for a single node with `8 * gfx950`
-* Added MSCCL support for AllGather multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the
-  environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.
+* Added support for `unroll=1` in device-code generation to improve performance.
+* Set a default of 112 channels for a single node with `8 * gfx950`.
+* Enabled LL128 protocol on `gfx950`.
+* Added MSCCL support for AllGather multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.
 * Thread thresholds for LL/LL128 are selected in Tuning Models for the MI300X. This impacts the number of channels used for AG and RS. Channel tuning model is bypassed if `NCCL_THREAD_THRESHOLDS`, `NCCL_MIN_NCHANNELS', or 'NCCL_MAX_NCHANNELS` are set.
 * Multi-node tuning for AllGather, AllReduce, and ReduceScatter that leverages LL/LL64/LL128 protocol to use nontemporal vector load/store for tunable message size ranges.
 * LL/LL128 usage ranges for AR, AG, and RS are part of the tuning models, which enable architecture-specific tuning in conjunction with the existing Rome Models scheme in RCCL.
@@ -310,4 +310,4 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 ### Changed
 - Switched to hip-clang as default compiler
 ### Deprecated
-- Deprecated hcc build
\ No newline at end of file
+- Deprecated hcc build
diff --git a/projects/rccl/src/device/generate.py b/projects/rccl/src/device/generate.py
index bcc15aefb6..58096432f9 100755
--- a/projects/rccl/src/device/generate.py
+++ b/projects/rccl/src/device/generate.py
@@ -310,7 +310,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
   for fn in primary_funcs:
     sym = paste("_", "ncclDevFunc", *fn)
     if fn[2] == "LL128":
-      out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
+      out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n")
       out("%s %s();\n#else\n" % (func_declaration, sym))
       fn_ll = fn[:2] + ("LL",) + fn[3:]
       sym_ll = paste("_", "ncclDevFunc", *fn_ll)
@@ -327,7 +327,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
     if unroll != "1": continue
     sym = paste("_", "ncclDevFunc", *fn)
     if fn[2] == "LL128":
-      out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
+      out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n")
       out("/*%4d*/ %s,\n#else\n" % (index1, sym))
       fn_ll = fn[:2] + ("LL",) + fn[3:]
       sym_ll = paste("_", "ncclDevFunc", *fn_ll)
@@ -344,7 +344,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
     if unroll != "2": continue
     sym = paste("_", "ncclDevFunc", *fn)
     if fn[2] == "LL128":
-      out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
+      out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n")
       out("/*%4d*/ %s,\n#else\n" % (index2, sym))
       fn_ll = fn[:2] + ("LL",) + fn[3:]
       sym_ll = paste("_", "ncclDevFunc", *fn_ll)
@@ -361,7 +361,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
     if unroll != "4": continue
     sym = paste("_", "ncclDevFunc", *fn)
     if fn[2] == "LL128":
-      out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
+      out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n")
       out("/*%4d*/ %s,\n#else\n" % (index4, sym))
       fn_ll = fn[:2] + ("LL",) + fn[3:]
       sym_ll = paste("_", "ncclDevFunc", *fn_ll)
@@ -531,7 +531,7 @@ for name in name_to_funcs.keys():
       (coll, algo, proto, redop, ty, unroll) = fn
       sym = paste("_", coll, algo, proto, redop, ty, unroll)
       if proto == "LL128":
-        out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
+        out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n")
       out(
         "DEFINE_ncclDevFunc({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {unroll})\n"
         .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
diff --git a/projects/rccl/src/graph/tuning.cc b/projects/rccl/src/graph/tuning.cc
index 4944ce3a9c..2b4ea5ecce 100644
--- a/projects/rccl/src/graph/tuning.cc
+++ b/projects/rccl/src/graph/tuning.cc
@@ -657,7 +657,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
       // Enable LL128 by default only on gfx90a with available tuning table
       pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
         ((IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") ||
-          IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942")) && comm->topo->ll128Enabled) ? 1 : 0;
+          IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
+          IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->ll128Enabled) ? 1 : 0;
 #else
       pEnable = 0;
 #endif
diff --git a/projects/rccl/src/rccl_wrap.cc b/projects/rccl/src/rccl_wrap.cc
index 397cbc7664..f77b9fb7ee 100644
--- a/projects/rccl/src/rccl_wrap.cc
+++ b/projects/rccl/src/rccl_wrap.cc
@@ -59,9 +59,10 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s
         }
       }
 #endif
-    } else if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942")) {
-      // Warn that model detection for MI300 (or future others) did not work as expected
-      // Add supported archs to this condition as they come (e.g. gfx950)
+    } else if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
+               IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
+      // Warn that model detection for the above listed architectures did not work as expected
+      // Add supported archs to this condition as they come
       // Also make sure the tuning_model and model detection are updated for new archs
       static bool failedWarn = false;
       if (!failedWarn) {
@@ -120,4 +121,4 @@ ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count,
   RCCL_STATIC_EXPOSE_CHECK();
   maxCount = ncclFuncMaxSendRecvCount(func, nRanks, count);
   return ncclSuccess;
-}
\ No newline at end of file
+}