diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp index 08abeaab31..623db25dbb 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp @@ -3561,6 +3561,7 @@ HSAILKernel::init(bool finalize) //compile kernel down to ISA if (finalize) { std::string options(compileOptions_.c_str()); + flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != 0) ? true: false; options.append(" -just-kernel="); options.append(openClKernelName.c_str()); // Append an option so that we can selectively enable a SCOption on CZ diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp index b236b4343b..7147128ff9 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp @@ -889,6 +889,10 @@ public: bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; } + //! Returns TRUE if kernel is internal kernel + bool isInternalKernel() const + { return (flags_.internalKernel_) ? true : false; } + //! Finds local workgroup size void findLocalWorkSize( size_t workDim, //!< Work dimension @@ -961,6 +965,7 @@ private: uint imageEna_: 1; //!< Kernel uses images uint imageWriteEna_: 1; //!< Kernel uses image writes uint dynamicParallelism_: 1; //!< Dynamic parallelism enabled + uint internalKernel_: 1; //!< True: internal kernel }; uint value_; Flags(): value_(0) {} diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp index f519f0ffe4..193c917de3 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -1771,237 +1771,280 @@ VirtualGPU::submitKernelInternalHSA( dbgManager->allocParamMemList(numParams); } - // Program the kernel arguments for the GPU execution - hsa_kernel_dispatch_packet_t* aqlPkt = - hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem, - vmDefQueue, &vmParentWrap, memList); - if (NULL == aqlPkt) { - LogError("Couldn't load kernel arguments"); - return false; + size_t newOffset[3] = {0, 0, 0}; + size_t newGlobalSize[3] = {0, 0, 0}; + + int dim = -1; + int iteration = 1; + size_t globalStep = 0; + for (uint i = 0; i < sizes.dimensions(); i++) { + newGlobalSize[i] = sizes.global()[i]; + newOffset[i] = sizes.offset()[i]; + } + // Check if it is blit kernel. If it is, then check if split is needed. + if (hsaKernel.isInternalKernel()) { + // Calculate new group size for each submission + for (uint i = 0; i < sizes.dimensions(); i++) { + if (sizes.global()[i] > static_cast(0xffffffff)) { + dim = i; + iteration = sizes.global()[i] / 0xC0000000 + + ((sizes.global()[i] % 0xC0000000) ? 1: 0); + globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration + * sizes.local()[dim]; + break; + } + } } - gslMemObject scratch = NULL; - uint scratchOffset = 0; - // Check if the device allocated more registers than the old setup - if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) { - const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing()); - const std::vector& mems = scratchObj->memObjs_; - scratch = mems[0]->gslResource(); - memList.push_back(mems[0]); - scratchOffset = scratchObj->offset_; - } - - // Add GSL handle to the memory list for VidMM - for (uint i = 0; i < memList.size(); ++i) { - addVmMemory(memList[i]); - } - - // HW Debug for the kernel? - HwDbgKernelInfo kernelInfo; - HwDbgKernelInfo *pKernelInfo = NULL; - - if (dbgManager) { - buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent); - pKernelInfo = &kernelInfo; - } - - GpuEvent gpuEvent; - // Run AQL dispatch in HW - runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_, - scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo); - - if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) { - dbgManager->executePostDispatchCallBack(); - } - - if (hsaKernel.dynamicParallelism()) { - // Make sure exculsive access to the device queue - amd::ScopedLock(defQueue->lock()); - - if (GPU_PRINT_CHILD_KERNEL != 0) { - waitForEvent(&gpuEvent); - - AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]); - uint p = 0; - for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) { - if (wraps[i].state != 0) { - uint j; - if (p == GPU_PRINT_CHILD_KERNEL) { - break; - } - p++; - std::stringstream print; - print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase); - print << "Slot#: " << i << "\n"; - print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n"; - print << "\tcommand_id: " << wraps[i].command_id << "\n"; - print << "\tchild_counter: " << wraps[i].child_counter << "\n"; - print << "\tcompletion: " << wraps[i].completion << "\n"; - print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n"; - print << "\twait_list: " << wraps[i].wait_list << "\n"; - print << "\twait_num: " << wraps[i].wait_num << "\n"; - uint offsEvents = wraps[i].wait_list - - gpuDefQueue->virtualQueue_->vmAddress(); - size_t* events = reinterpret_cast( - gpuDefQueue->virtualQueue_->data() + offsEvents); - for (j = 0; j < wraps[i].wait_num; ++j) { - uint offs = static_cast(events[j]) - - gpuDefQueue->virtualQueue_->vmAddress(); - AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs); - print << "Wait Event#: " << j << "\n"; - print << "\tState: " << eventD->state << - "; Counter: " << eventD->counter << "\n"; - } - print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", "; - print << wraps[i].aql.workgroup_size_y << ", "; - print << wraps[i].aql.workgroup_size_z << "]\n"; - print << "GridSize[ " << wraps[i].aql.grid_size_x << ", "; - print << wraps[i].aql.grid_size_y << ", "; - print << wraps[i].aql.grid_size_z << "]\n"; - - uint64_t* kernels = (uint64_t*)( - const_cast(hsaKernel.prog().kernelTable())->map(this)); - for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) { - if (kernels[j] == wraps[i].aql.kernel_object) { - break; - } - } - const_cast(hsaKernel.prog().kernelTable())->unmap(this); - HSAILKernel* child = NULL; - for (auto it = hsaKernel.prog().kernels().begin(); - it != hsaKernel.prog().kernels().end(); ++it) { - if (j == static_cast(it->second)->index()) { - child = static_cast(it->second); - } - } - if (child == NULL) { - printf("Error: couldn't find child kernel!\n"); - continue; - } - const uint64_t kernarg_address = - static_cast(reinterpret_cast(wraps[i].aql.kernarg_address)); - uint offsArg = kernarg_address - - gpuDefQueue->virtualQueue_->vmAddress(); - address argum = gpuDefQueue->virtualQueue_->data() + offsArg; - print << "Kernel: " << child->name() << "\n"; - static const char* Names[HSAILKernel::ExtraArguments] = { - "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "}; - for (j = 0; j < HSAILKernel::ExtraArguments; ++j) { - print << "\t" << Names[j] << *(size_t*)argum; - print << "\n"; - argum += sizeof(size_t); - } - for (j = 0; j < child->numArguments(); ++j) { - print << "\t" << child->argument(j)->name_ << ": "; - for (int s = child->argument(j)->size_ - 1; s >= 0; --s) { - print.width(2); - print.fill('0'); - print << (uint32_t)(argum[s]); - } - argum += child->argument(j)->size_; - print << "\n"; - } - printf("%s", print.str().c_str()); - } + for (int j = 0; j < iteration; j++) { + // Reset global size for dimension dim if split is needed + if (dim != -1) { + newOffset[dim] = sizes.offset()[dim] + globalStep * j; + if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && + (j != (iteration - 1))) { + newGlobalSize[dim] = globalStep; + } + else { + newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim]; } } - if (!dev().settings().useDeviceQueue_) { - // Add the termination handshake to the host queue - virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(), - vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - vmParentWrap + offsetof(AmdAqlWrap, child_counter), - 0, dev().settings().useDeviceQueue_); + amd::NDRangeContainer tmpSizes(sizes.dimensions(), + &newOffset[0], &newGlobalSize[0], + &(const_cast(sizes).local()[0])); + + // Program the kernel arguments for the GPU execution + hsa_kernel_dispatch_packet_t* aqlPkt = + hsaKernel.loadArguments(*this, kernel, tmpSizes, parameters, nativeMem, + vmDefQueue, &vmParentWrap, memList); + if (NULL == aqlPkt) { + LogError("Couldn't load kernel arguments"); + return false; } - // Get the global loop start before the scheduler - mcaddr loopStart = gpuDefQueue->virtualQueueDispatcherStart(); - static_cast(gpuDefQueue->blitMgr()).runScheduler( - *gpuDefQueue->virtualQueue_, - *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, - gpuDefQueue->vqHeader_->aql_slot_num); - const static bool FlushL2 = true; - gpuDefQueue->flushCUCaches(FlushL2); - - // Get the address of PM4 template and add write it to params - //! @note DMA flush must not occur between patch and the scheduler - mcaddr patchStart = gpuDefQueue->virtualQueueDispatcherStart(); - - // Program parameters for the scheduler - SchedulerParam* param = &reinterpret_cast - (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; - param->signal = 1; - // Scale clock to 1024 to avoid 64 bit div in the scheduler - param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_; - param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/; - param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); - param->releaseHostCP = 0; - param->parentAQL = vmParentWrap; - param->dedicatedQueue = dev().settings().useDeviceQueue_; - - // Fill the scratch buffer information - if (hsaKernel.prog().maxScratchRegs() > 0) { - gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0]; - param->scratchSize = scratchBuf->size(); - param->scratch = scratchBuf->vmAddress(); - param->numMaxWaves = 32 * dev().info().maxComputeUnits_; - param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_; - memList.push_back(scratchBuf); + gslMemObject scratch = NULL; + uint scratchOffset = 0; + // Check if the device allocated more registers than the old setup + if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) { + const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing()); + const std::vector& mems = scratchObj->memObjs_; + scratch = mems[0]->gslResource(); + memList.push_back(mems[0]); + scratchOffset = scratchObj->offset_; } - else { - param->numMaxWaves = 0; - param->scratchSize = 0; - param->scratch = 0; - param->scratchOffset = 0; - } - - // Add all kernels in the program to the mem list. - //! \note Runtime doesn't know which one will be called - hsaKernel.prog().fillResListWithKernels(memList); // Add GSL handle to the memory list for VidMM for (uint i = 0; i < memList.size(); ++i) { - gpuDefQueue->addVmMemory(memList[i]); + addVmMemory(memList[i]); } - mcaddr signalAddr = gpuDefQueue->schedParams_->vmAddress() + - gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); - gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent, - gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_, - signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / 32); + // HW Debug for the kernel? + HwDbgKernelInfo kernelInfo; + HwDbgKernelInfo *pKernelInfo = NULL; + + if (dbgManager) { + buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent); + pKernelInfo = &kernelInfo; + } + + GpuEvent gpuEvent; + // Run AQL dispatch in HW + runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_, + scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo); + + if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) { + dbgManager->executePostDispatchCallBack(); + } + + if (hsaKernel.dynamicParallelism()) { + // Make sure exculsive access to the device queue + amd::ScopedLock(defQueue->lock()); + + if (GPU_PRINT_CHILD_KERNEL != 0) { + waitForEvent(&gpuEvent); + + AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]); + uint p = 0; + for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) { + if (wraps[i].state != 0) { + uint j; + if (p == GPU_PRINT_CHILD_KERNEL) { + break; + } + p++; + std::stringstream print; + print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase); + print << "Slot#: " << i << "\n"; + print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n"; + print << "\tcommand_id: " << wraps[i].command_id << "\n"; + print << "\tchild_counter: " << wraps[i].child_counter << "\n"; + print << "\tcompletion: " << wraps[i].completion << "\n"; + print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n"; + print << "\twait_list: " << wraps[i].wait_list << "\n"; + print << "\twait_num: " << wraps[i].wait_num << "\n"; + uint offsEvents = wraps[i].wait_list - + gpuDefQueue->virtualQueue_->vmAddress(); + size_t* events = reinterpret_cast( + gpuDefQueue->virtualQueue_->data() + offsEvents); + for (j = 0; j < wraps[i].wait_num; ++j) { + uint offs = static_cast(events[j]) - + gpuDefQueue->virtualQueue_->vmAddress(); + AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs); + print << "Wait Event#: " << j << "\n"; + print << "\tState: " << eventD->state << + "; Counter: " << eventD->counter << "\n"; + } + print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", "; + print << wraps[i].aql.workgroup_size_y << ", "; + print << wraps[i].aql.workgroup_size_z << "]\n"; + print << "GridSize[ " << wraps[i].aql.grid_size_x << ", "; + print << wraps[i].aql.grid_size_y << ", "; + print << wraps[i].aql.grid_size_z << "]\n"; + + uint64_t* kernels = (uint64_t*)( + const_cast(hsaKernel.prog().kernelTable())->map(this)); + for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) { + if (kernels[j] == wraps[i].aql.kernel_object) { + break; + } + } + const_cast(hsaKernel.prog().kernelTable())->unmap(this); + HSAILKernel* child = NULL; + for (auto it = hsaKernel.prog().kernels().begin(); + it != hsaKernel.prog().kernels().end(); ++it) { + if (j == static_cast(it->second)->index()) { + child = static_cast(it->second); + } + } + if (child == NULL) { + printf("Error: couldn't find child kernel!\n"); + continue; + } + const uint64_t kernarg_address = + static_cast(reinterpret_cast(wraps[i].aql.kernarg_address)); + uint offsArg = kernarg_address - + gpuDefQueue->virtualQueue_->vmAddress(); + address argum = gpuDefQueue->virtualQueue_->data() + offsArg; + print << "Kernel: " << child->name() << "\n"; + static const char* Names[HSAILKernel::ExtraArguments] = { + "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "}; + for (j = 0; j < HSAILKernel::ExtraArguments; ++j) { + print << "\t" << Names[j] << *(size_t*)argum; + print << "\n"; + argum += sizeof(size_t); + } + for (j = 0; j < child->numArguments(); ++j) { + print << "\t" << child->argument(j)->name_ << ": "; + for (int s = child->argument(j)->size_ - 1; s >= 0; --s) { + print.width(2); + print.fill('0'); + print << (uint32_t)(argum[s]); + } + argum += child->argument(j)->size_; + print << "\n"; + } + printf("%s", print.str().c_str()); + } + } + } + + if (!dev().settings().useDeviceQueue_) { + // Add the termination handshake to the host queue + virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + 0, dev().settings().useDeviceQueue_); + } + + // Get the global loop start before the scheduler + mcaddr loopStart = gpuDefQueue->virtualQueueDispatcherStart(); + static_cast(gpuDefQueue->blitMgr()).runScheduler( + *gpuDefQueue->virtualQueue_, + *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, + gpuDefQueue->vqHeader_->aql_slot_num); + const static bool FlushL2 = true; + gpuDefQueue->flushCUCaches(FlushL2); + + // Get the address of PM4 template and add write it to params + //! @note DMA flush must not occur between patch and the scheduler + mcaddr patchStart = gpuDefQueue->virtualQueueDispatcherStart(); + + // Program parameters for the scheduler + SchedulerParam* param = &reinterpret_cast + (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; + param->signal = 1; + // Scale clock to 1024 to avoid 64 bit div in the scheduler + param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_; + param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/; + param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); + param->releaseHostCP = 0; + param->parentAQL = vmParentWrap; + param->dedicatedQueue = dev().settings().useDeviceQueue_; + + // Fill the scratch buffer information + if (hsaKernel.prog().maxScratchRegs() > 0) { + gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0]; + param->scratchSize = scratchBuf->size(); + param->scratch = scratchBuf->vmAddress(); + param->numMaxWaves = 32 * dev().info().maxComputeUnits_; + param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_; + memList.push_back(scratchBuf); + } + else { + param->numMaxWaves = 0; + param->scratchSize = 0; + param->scratch = 0; + param->scratchOffset = 0; + } + + // Add all kernels in the program to the mem list. + //! \note Runtime doesn't know which one will be called + hsaKernel.prog().fillResListWithKernels(memList); + + // Add GSL handle to the memory list for VidMM + for (uint i = 0; i < memList.size(); ++i) { + gpuDefQueue->addVmMemory(memList[i]); + } + + mcaddr signalAddr = gpuDefQueue->schedParams_->vmAddress() + + gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); + gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent, + gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_, + signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / 32); + + // Set GPU event for the used resources + for (uint i = 0; i < memList.size(); ++i) { + memList[i]->setBusy(*gpuDefQueue, gpuEvent); + } + + if (dev().settings().useDeviceQueue_) { + // Add the termination handshake to the host queue + virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + signalAddr, dev().settings().useDeviceQueue_); + } + + ++gpuDefQueue->schedParamIdx_ %= + gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam); + //! \todo optimize the wrap around + if (gpuDefQueue->schedParamIdx_ == 0) { + gpuDefQueue->schedParams_->wait(*gpuDefQueue); + } + } // Set GPU event for the used resources for (uint i = 0; i < memList.size(); ++i) { - memList[i]->setBusy(*gpuDefQueue, gpuEvent); + memList[i]->setBusy(*this, gpuEvent); } - if (dev().settings().useDeviceQueue_) { - // Add the termination handshake to the host queue - virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(), - vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - vmParentWrap + offsetof(AmdAqlWrap, child_counter), - signalAddr, dev().settings().useDeviceQueue_); + // Update the global GPU event + setGpuEvent(gpuEvent); + + if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) { + LogError("Couldn't read printf data from the buffer!\n"); + return false; } - - ++gpuDefQueue->schedParamIdx_ %= - gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam); - //! \todo optimize the wrap around - if (gpuDefQueue->schedParamIdx_ == 0) { - gpuDefQueue->schedParams_->wait(*gpuDefQueue); - } - } - - // Set GPU event for the used resources - for (uint i = 0; i < memList.size(); ++i) { - memList[i]->setBusy(*this, gpuEvent); - } - - // Update the global GPU event - setGpuEvent(gpuEvent); - - if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) { - LogError("Couldn't read printf data from the buffer!\n"); - return false; } // Runtime submitted a HSAIL kernel diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp index beb76aa56f..cf941095bd 100644 --- a/projects/clr/rocclr/runtime/utils/flags.hpp +++ b/projects/clr/rocclr/runtime/utils/flags.hpp @@ -154,7 +154,7 @@ release(bool, GPU_DIRECT_SRD, false, \ "Use indirect SRD access in HSAIL") \ release(bool, GPU_USE_DEVICE_QUEUE, false, \ "Use a dedicated device queue for the actual submissions") \ -release(bool, GPU_ENABLE_LARGE_ALLOCATION, false, \ +release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \ "Enable >4GB single allocations") \ release(bool, AMD_THREAD_TRACE_ENABLE, !IS_MAINLINE, \ "Enable thread trace extension") \