@@ -1619,7 +1619,9 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
|
||||
// Allows runtime to track some information about the stream.
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp,
|
||||
const char* kernelNameStr, bool lockAcquired) {
|
||||
stream = ihipSyncAndResolveStream(stream, lockAcquired);
|
||||
if (stream == nullptr || stream != stream->getCtx()->_defaultStream){
|
||||
stream = ihipSyncAndResolveStream(stream, lockAcquired);
|
||||
}
|
||||
lp->grid_dim.x = grid.x;
|
||||
lp->grid_dim.y = grid.y;
|
||||
lp->grid_dim.z = grid.z;
|
||||
|
||||
@@ -140,7 +140,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
|
||||
void** impCoopParams = 0) {
|
||||
void** impCoopParams = 0, hc::accelerator_view* coopAV = 0) {
|
||||
using namespace hip_impl;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
@@ -192,7 +192,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
|
||||
if (impCoopParams) {
|
||||
const auto p{static_cast<const char*>(*impCoopParams)};
|
||||
// The sixth index is for multi-grid synchronization
|
||||
kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
|
||||
kernargs.insert((kernargs.cend() - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
|
||||
p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
|
||||
}
|
||||
|
||||
@@ -245,6 +245,10 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
|
||||
|
||||
hc::completion_future cf;
|
||||
|
||||
if (coopAV) {
|
||||
lp.av = coopAV;
|
||||
}
|
||||
|
||||
lp.av->dispatch_hsa_kernel(&aql, kernargs.data(), kernargs.size(),
|
||||
(startEvent || stopEvent) ? &cf : nullptr
|
||||
#if (__hcc_workweek__ > 17312)
|
||||
@@ -399,6 +403,82 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
|
||||
return result;
|
||||
}
|
||||
|
||||
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
|
||||
{
|
||||
if (f->_is_code_object_v3) {
|
||||
const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
|
||||
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
|
||||
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
|
||||
*usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
|
||||
// GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
|
||||
// the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
|
||||
*usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
|
||||
*usedLDS = header->group_segment_fixed_size;
|
||||
}
|
||||
else {
|
||||
const auto header = f->_header;
|
||||
// VGPRs granularity is 4
|
||||
*usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
|
||||
// adding 2 to take into account the 2 VCC registers & handle the granularity of 16
|
||||
*usedSGPRS = header->wavefront_sgpr_count + 2;
|
||||
*usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
|
||||
*usedLDS = header->workgroup_group_segment_byte_size;
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
|
||||
{
|
||||
using namespace hip_impl;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
if (ctx == nullptr) {
|
||||
return hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
hipDeviceProp_t prop{};
|
||||
ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
|
||||
|
||||
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
|
||||
|
||||
size_t usedVGPRS = 0;
|
||||
size_t usedSGPRS = 0;
|
||||
size_t usedLDS = 0;
|
||||
getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
|
||||
|
||||
// Due to SPI and private memory limitations, the max of wavefronts per CU in 32
|
||||
size_t wavefrontSize = prop.warpSize;
|
||||
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
|
||||
|
||||
const size_t simdPerCU = 4;
|
||||
const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;
|
||||
|
||||
size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
|
||||
|
||||
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
|
||||
size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
|
||||
: std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
|
||||
|
||||
// Calculate blocks occupancy per CU based on VGPR usage
|
||||
*numBlocks = vgprs_alu_occupancy / numWavefronts;
|
||||
|
||||
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
|
||||
size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
|
||||
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
|
||||
|
||||
// Calculate blocks occupancy per CU based on SGPR usage
|
||||
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
|
||||
|
||||
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
|
||||
if (total_used_lds != 0) {
|
||||
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
|
||||
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
|
||||
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
|
||||
}
|
||||
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
namespace {
|
||||
// kernel for initializing GWS
|
||||
// nwm1 is the total number of work groups minus 1
|
||||
@@ -412,13 +492,16 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes,
|
||||
hipStream_t stream, hip_impl::program_state& ps) {
|
||||
|
||||
#if (__hcc_workweek__ >= 20093)
|
||||
hipError_t result;
|
||||
|
||||
|
||||
if ((f == nullptr) || (stream == nullptr) || (kernelParams == nullptr)) {
|
||||
if (f == nullptr || kernelParams == nullptr) {
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (!stream->getDevice()->_props.cooperativeLaunch) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
@@ -459,28 +542,44 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
kd->_kernarg_layout = *reinterpret_cast<const std::vector<
|
||||
std::pair<std::size_t, std::size_t>>*>(kargs.getHandle());
|
||||
|
||||
GET_TLS();
|
||||
uint32_t numBlocksPerSm;
|
||||
result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kd,
|
||||
stream->getDevice()->_props.warpSize, sharedMemBytes);
|
||||
if (result != hipSuccess) {
|
||||
return hipErrorLaunchFailure;
|
||||
}
|
||||
int maxActiveBlocks = numBlocksPerSm * stream->getDevice()->_props.multiProcessorCount;
|
||||
|
||||
//check to see if the workload fits on the GPU
|
||||
if (gridDim.x * gridDim.y * gridDim.z > maxActiveBlocks){
|
||||
return hipErrorCooperativeLaunchTooLarge;
|
||||
}
|
||||
|
||||
void *gwsKernelParam[1];
|
||||
// calculate total number of work groups minus 1 for the main kernel
|
||||
uint nwm1 = (gridDim.x * gridDim.y * gridDim.z) - 1;
|
||||
gwsKernelParam[0] = &nwm1;
|
||||
|
||||
LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
streamCrit->_av.acquire_locked_hsa_queue();
|
||||
#endif
|
||||
hc::accelerator acc = stream->getDevice()->_acc;
|
||||
// create a cooperative accelerated view for launching gws and main kernels
|
||||
hc::accelerator_view coopAV = acc.create_cooperative_view();
|
||||
|
||||
GET_TLS();
|
||||
// launch the init_gws kernel to initialize the GWS
|
||||
// wait for this stream to finish operations
|
||||
stream->locked_wait();
|
||||
|
||||
LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
|
||||
streamCrit->_av.acquire_locked_hsa_queue();
|
||||
coopAV.acquire_locked_hsa_queue();
|
||||
|
||||
// launch the init_gws kernel to initialize the GWS in the dedicated cooperative queue
|
||||
result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1,
|
||||
0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
|
||||
0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr , &coopAV);
|
||||
|
||||
if (result != hipSuccess) {
|
||||
stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
|
||||
coopAV.release_locked_hsa_queue();
|
||||
return hipErrorLaunchFailure;
|
||||
}
|
||||
|
||||
@@ -488,60 +587,88 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
void* impCoopParams[1];
|
||||
impCoopParams[0] = &impCoopArg;
|
||||
|
||||
// launch the main kernel
|
||||
// launch the main kernel in the cooperative queue
|
||||
result = ihipModuleLaunchKernel(tls, kd,
|
||||
gridDim.x * blockDimX.x,
|
||||
gridDim.y * blockDimX.y,
|
||||
gridDim.z * blockDimX.z,
|
||||
blockDimX.x, blockDimX.y, blockDimX.z,
|
||||
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
|
||||
nullptr, 0, true, impCoopParams);
|
||||
nullptr, 0, true, impCoopParams, &coopAV);
|
||||
|
||||
stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
coopAV.release_locked_hsa_queue();
|
||||
|
||||
// wait on the dispatch on the dedicated cooperative queue to finish
|
||||
coopAV.wait(hc::hcWaitModeActive);
|
||||
|
||||
|
||||
return result;
|
||||
#else
|
||||
return hipErrorInvalidConfiguration;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
__attribute__((visibility("default")))
|
||||
hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags, hip_impl::program_state& ps) {
|
||||
|
||||
#if (__hcc_workweek__ >= 20093)
|
||||
hipError_t result;
|
||||
|
||||
if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
vector<hipStream_t> streams;
|
||||
vector<uint64_t> deviceIDs;
|
||||
// check to see if we have valid distinct streams/devices, if cooperative multi device
|
||||
// launch is supported and if grid/block dimensions are valid
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
if (!launchParamsList[i].stream->getDevice()->_props.cooperativeMultiDeviceLaunch) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
|
||||
if (lp.stream == nullptr){
|
||||
return hipErrorInvalidResourceHandle;
|
||||
}
|
||||
|
||||
auto it = find(streams.begin(), streams.end(), lp.stream);
|
||||
if (it == streams.end()){
|
||||
streams.push_back(lp.stream);
|
||||
} else{
|
||||
return hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
const ihipDevice_t* currentDevice = lp.stream->getDevice();
|
||||
auto it1 = find(deviceIDs.begin(), deviceIDs.end(), currentDevice->_deviceId);
|
||||
if (it1 == deviceIDs.end()){
|
||||
deviceIDs.push_back(currentDevice->_deviceId);
|
||||
} else {
|
||||
return hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
if (!currentDevice->_props.cooperativeMultiDeviceLaunch) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
if (lp.gridDim.x == 0 || lp.gridDim.y == 0 || lp.gridDim.z == 0 ||
|
||||
lp.blockDim.x == 0 || lp.blockDim.y == 0 || lp.blockDim.z == 0){
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
}
|
||||
|
||||
hipFunction_t* gwsKds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
|
||||
hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
|
||||
|
||||
if (kds == nullptr || gwsKds == nullptr) {
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
vector<hipFunction_t> gwsKds;
|
||||
vector<hipFunction_t> kds;
|
||||
|
||||
GET_TLS();
|
||||
// prepare all kernel descriptors for initializing the GWS and the main kernels per device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
if (lp.stream == nullptr) {
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
|
||||
gwsKds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
|
||||
hip_impl::target_agent(lp.stream));
|
||||
gwsKds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
|
||||
hip_impl::target_agent(lp.stream)));
|
||||
if (gwsKds[i] == nullptr) {
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align(
|
||||
@@ -550,23 +677,42 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
gwsKargs.getHandle());
|
||||
|
||||
|
||||
kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
|
||||
hip_impl::target_agent(lp.stream));
|
||||
kds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
|
||||
hip_impl::target_agent(lp.stream)));
|
||||
if (kds[i] == nullptr) {
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align(
|
||||
reinterpret_cast<std::uintptr_t>(lp.func));
|
||||
kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
|
||||
kargs.getHandle());
|
||||
|
||||
uint32_t numBlocksPerSm;
|
||||
result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kds[i],
|
||||
lp.stream->getDevice()->_props.warpSize, lp.sharedMem);
|
||||
if (result != hipSuccess) {
|
||||
return hipErrorLaunchFailure;
|
||||
}
|
||||
int maxActiveBlocks = numBlocksPerSm * lp.stream->getDevice()->_props.multiProcessorCount;
|
||||
|
||||
//check to see if the workload fits on the GPU
|
||||
if (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z > maxActiveBlocks){
|
||||
return hipErrorCooperativeLaunchTooLarge;
|
||||
}
|
||||
}
|
||||
|
||||
vector<hc::accelerator_view> coopAVs;
|
||||
|
||||
// create cooperative accelerated views for launching gws and main kernels on each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
hc::accelerator acc = launchParamsList[i].stream->getDevice()->_acc;
|
||||
coopAVs.push_back(acc.create_cooperative_view());
|
||||
}
|
||||
|
||||
mg_sync *mg_sync_ptr = 0;
|
||||
mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
|
||||
vector<mg_info *> mg_info_ptr;
|
||||
|
||||
|
||||
GET_TLS();
|
||||
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
|
||||
if (result != hipSuccess) {
|
||||
return hipErrorInvalidValue;
|
||||
@@ -576,7 +722,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
|
||||
uint all_sum = 0;
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
|
||||
mg_info *mg_info_temp;
|
||||
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault);
|
||||
if (result != hipSuccess) {
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < i; ++j) {
|
||||
@@ -584,6 +731,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
}
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
mg_info_ptr.push_back(mg_info_temp);
|
||||
// calculate the sum of sizes of all grids
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
|
||||
@@ -592,10 +740,10 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
|
||||
// lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
launchParamsList[i].stream->locked_wait();
|
||||
LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
streamCrit->_av.acquire_locked_hsa_queue();
|
||||
#endif
|
||||
coopAVs[i].acquire_locked_hsa_queue();
|
||||
}
|
||||
|
||||
// launch the init_gws kernel to initialize the GWS for each device
|
||||
@@ -607,14 +755,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
gwsKernelParam[0] = &nwm1;
|
||||
|
||||
result = ihipModuleLaunchKernel(tls, gwsKds[i], 1, 1, 1, 1, 1, 1,
|
||||
0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
|
||||
0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr, &coopAVs[i]);
|
||||
|
||||
if (result != hipSuccess) {
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
launchParamsList[j].stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
coopAVs[i].release_locked_hsa_queue();
|
||||
}
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
@@ -660,14 +807,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
lp.blockDim.x, lp.blockDim.y,
|
||||
lp.blockDim.z, lp.sharedMem,
|
||||
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
|
||||
true, impCoopParams);
|
||||
true, impCoopParams, &coopAVs[i]);
|
||||
|
||||
if (result != hipSuccess) {
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
launchParamsList[j].stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
coopAVs[i].release_locked_hsa_queue();
|
||||
}
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
@@ -682,13 +828,14 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
// unlock all streams
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
launchParamsList[i].stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
coopAVs[i].release_locked_hsa_queue();
|
||||
}
|
||||
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
// wait on the dispatch on cooperative queues on each device to finish
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
coopAVs[i].wait(hc::hcWaitModeActive);
|
||||
}
|
||||
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
@@ -696,6 +843,9 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
}
|
||||
|
||||
return result;
|
||||
#else
|
||||
return hipErrorInvalidConfiguration;
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace hip_impl {
|
||||
@@ -1264,29 +1414,6 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
|
||||
{
|
||||
if (f->_is_code_object_v3) {
|
||||
const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
|
||||
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
|
||||
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
|
||||
*usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
|
||||
// GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
|
||||
// the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
|
||||
*usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
|
||||
*usedLDS = header->group_segment_fixed_size;
|
||||
}
|
||||
else {
|
||||
const auto header = f->_header;
|
||||
// VGPRs granularity is 4
|
||||
*usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
|
||||
// adding 2 to take into account the 2 VCC registers & handle the granularity of 16
|
||||
*usedSGPRS = header->wavefront_sgpr_count + 2;
|
||||
*usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
|
||||
*usedLDS = header->workgroup_group_segment_byte_size;
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize,
|
||||
hipFunction_t f, size_t dynSharedMemPerBlk,
|
||||
uint32_t blockSizeLimit)
|
||||
@@ -1409,59 +1536,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
|
||||
gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
|
||||
}
|
||||
|
||||
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
|
||||
{
|
||||
using namespace hip_impl;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
if (ctx == nullptr) {
|
||||
return hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
hipDeviceProp_t prop{};
|
||||
ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
|
||||
|
||||
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
|
||||
|
||||
size_t usedVGPRS = 0;
|
||||
size_t usedSGPRS = 0;
|
||||
size_t usedLDS = 0;
|
||||
getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
|
||||
|
||||
// Due to SPI and private memory limitations, the max of wavefronts per CU in 32
|
||||
size_t wavefrontSize = prop.warpSize;
|
||||
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
|
||||
|
||||
const size_t simdPerCU = 4;
|
||||
const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;
|
||||
|
||||
size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
|
||||
|
||||
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
|
||||
size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
|
||||
: std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
|
||||
|
||||
// Calculate blocks occupancy per CU based on VGPR usage
|
||||
*numBlocks = vgprs_alu_occupancy / numWavefronts;
|
||||
|
||||
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
|
||||
size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
|
||||
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
|
||||
|
||||
// Calculate blocks occupancy per CU based on SGPR usage
|
||||
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
|
||||
|
||||
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
|
||||
if (total_used_lds != 0) {
|
||||
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
|
||||
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
|
||||
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
|
||||
}
|
||||
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
|
||||
{
|
||||
|
||||
Ссылка в новой задаче
Block a user