Updating inline asm to not require explicit L1 cache invalidation (#270)

Αυτή η υποβολή περιλαμβάνεται σε:
Stanley Tsang
2020-09-25 13:46:26 -06:00
υποβλήθηκε από GitHub
γονέας ee262819a7
υποβολή acca2ae20a
@@ -99,9 +99,8 @@ class ncclLLPrimitives {
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
do {
-asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
-"s_waitcnt vmcnt(0)\n"
-"buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
+asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
+"s_waitcnt vmcnt(0)\n" : "=v"(i4) : "v"(src));
if (checkAbort(i, 0)) break;
} while ((i4[1] != flag) || (i4[3] != flag));
uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
@@ -123,9 +122,8 @@ class ncclLLPrimitives {
i4[1] = flag;
i4[2] = (val >> 32);
i4[3] = flag;
-asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
-"s_waitcnt vmcnt(0)\n"
-"buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
+asm volatile ("flat_store_dwordx4 %0, %1, glc, slc\n"
+"s_waitcnt vmcnt(0)\n" : : "v"(dst), "v"(i4));
#else
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
#endif