diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h
index ed5ab85923..40b4928d21 100644
--- a/include/hip/hcc_detail/hip_runtime_api.h
+++ b/include/hip/hcc_detail/hip_runtime_api.h
@@ -1196,6 +1196,20 @@ hipError_t hipStreamGetPriority(hipStream_t stream, int* priority);
 hipError_t hipExtStreamCreateWithCUMask(hipStream_t* stream, uint32_t cuMaskSize,
                                         const uint32_t* cuMask);
 
+/**
+ * @brief Get CU mask associated with an asynchronous stream
+ *
+ * @param[in] stream stream to be queried
+ * @param[in] cuMaskSize number of uint32_t elements in the user-allocated cuMask buffer
+ * @param[out] cuMask Pointer to a pre-allocated block of memory (uint32_t *) in which
+ * the stream's CU mask is returned. The CU mask is returned in chunks of 32 bits where
+ * each active bit represents one active CU
+ * @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue
+ *
+ * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32_t* cuMask);
+
 /**
  * Stream CallBack struct
  */
diff --git a/rocclr/hip_hcc.def.in b/rocclr/hip_hcc.def.in
index cc8cd86e0a..f6c93581a4 100755
--- a/rocclr/hip_hcc.def.in
+++ b/rocclr/hip_hcc.def.in
@@ -266,3 +266,4 @@ hipMemcpyHtoA
 hipMemcpyParam2DAsync
 __gnu_h2f_ieee
 __gnu_f2h_ieee
+hipExtStreamGetCUMask
diff --git a/rocclr/hip_hcc.map.in b/rocclr/hip_hcc.map.in
index 5b5dcf692a..82ddc72323 100755
--- a/rocclr/hip_hcc.map.in
+++ b/rocclr/hip_hcc.map.in
@@ -275,6 +275,7 @@ global:
     hipMemcpyHtoA;
     hipMemcpyParam2DAsync;
     __hipGetPCH;
+    hipExtStreamGetCUMask;
   };
 local:
     *;
diff --git a/rocclr/hip_internal.hpp b/rocclr/hip_internal.hpp
index 7e0cc8b9a2..857ae8e07f 100755
--- a/rocclr/hip_internal.hpp
+++ b/rocclr/hip_internal.hpp
@@ -156,6 +156,8 @@ namespace hip {
     unsigned int Flags() const { return flags_; }
     /// Returns the priority for the current stream
     Priority GetPriority() const { return priority_; }
+    /// Returns the CU mask for the current stream (by reference, so callers do not pay for a copy)
+    const std::vector<uint32_t>& GetCUMask() const { return cuMask_; }
 
     /// Sync all non-blocking streams
     static void syncNonBlockingStreams();
diff --git a/rocclr/hip_stream.cpp b/rocclr/hip_stream.cpp
index 109e8c8b7d..24312f018d 100755
--- a/rocclr/hip_stream.cpp
+++ b/rocclr/hip_stream.cpp
@@ -396,3 +396,73 @@ hipError_t hipStreamGetPriority(hipStream_t stream, int* priority) {
 
   HIP_RETURN(hipSuccess);
 }
+
+// ================================================================================================
+// Writes the effective CU mask of `stream` into cuMask as 32-bit words, one bit per compute unit.
+//   - null stream: the global CU mask if one is defined, otherwise a mask with every CU active
+//   - other streams: the stream's own mask ANDed with that base mask; if the result is empty or
+//     all zero, the base mask is returned instead
+// Returns hipErrorInvalidValue when cuMask is null or cuMaskSize is smaller than the number of
+// words needed to cover every compute unit of the current device.
+hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32_t* cuMask) {
+  HIP_INIT_API(hipExtStreamGetCUMask, stream, cuMaskSize, cuMask);
+
+  if (cuMask == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  int deviceId = hip::getCurrentDevice()->deviceId();
+  auto* deviceHandle = g_devices[deviceId]->devices()[0];
+  const auto& info = deviceHandle->info();
+
+  // Minimum number of 32-bit words needed to hold one bit per compute unit (rounded up);
+  // a smaller caller buffer is rejected.
+  const uint32_t cuMaskSizeRequired =
+      info.maxComputeUnits_ / 32 + ((info.maxComputeUnits_ % 32) ? 1 : 0);
+
+  if (cuMaskSize < cuMaskSizeRequired) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  // Default mask with every CU active: whole words of ones, then a partial word for the
+  // remaining CUs. The shift count stays in [1, 31], so it is well defined for a 32-bit operand.
+  const size_t fullWords = info.maxComputeUnits_ / 32;
+  std::vector<uint32_t> defaultCUMask(fullWords, 0xFFFFFFFFu);
+  const uint32_t tailBits = info.maxComputeUnits_ % 32;
+  if (tailBits != 0) {
+    defaultCUMask.push_back((1u << tailBits) - 1u);
+  }
+
+  // Base mask: the global CU mask when one is defined, otherwise the default mask.
+  const std::vector<uint32_t>& baseCUMask =
+      info.globalCUMask_.empty() ? defaultCUMask : info.globalCUMask_;
+
+  // For a non-null stream, AND the stream's own CU mask with the base mask word by word.
+  // NOTE(review): the cast target hip::Stream* is assumed from the accessor's namespace - confirm.
+  std::vector<uint32_t> mask;
+  if (stream != nullptr) {
+    const std::vector<uint32_t>& streamCUMask =
+        reinterpret_cast<hip::Stream*>(stream)->GetCUMask();
+    const size_t words = std::min(streamCUMask.size(), baseCUMask.size());
+    mask.reserve(words);
+    for (size_t i = 0; i < words; ++i) {
+      mask.push_back(streamCUMask[i] & baseCUMask[i]);
+    }
+  }
+
+  // An empty or all-zero result (null stream, stream with an empty mask, or a custom mask
+  // that does not overlap the base mask) falls back to the base mask.
+  const bool zeroCUMask =
+      std::all_of(mask.begin(), mask.end(), [](uint32_t word) { return word == 0; });
+  if (zeroCUMask) {
+    mask = baseCUMask;
+  }
+
+  // Copy the result for both the global-mask and default-mask cases, never writing past the
+  // caller's buffer, and clear any remaining words so the caller does not read stale data.
+  const size_t copied = std::min<size_t>(mask.size(), cuMaskSize);
+  std::copy(mask.begin(), mask.begin() + copied, cuMask);
+  std::fill(cuMask + copied, cuMask + cuMaskSize, 0u);
+
+  HIP_RETURN(hipSuccess);
+}