#include #include #include #include #include #include #include #include #define LEN 64 #define SIZE LEN<<2 typedef hsa_code_object_t hipmodule; typedef uint64_t hipfunction; typedef unsigned int hipDevicePtr; hsa_region_t systemRegion; hsa_region_t kernArgRegion; hsa_agent_t gpuAgent; hsa_queue_t *Queue; hsa_signal_t signal; hsa_status_t findGpu(hsa_agent_t agent, void *data){ hsa_device_type_t device_type; hsa_status_t hsa_error_code = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); if(hsa_error_code != HSA_STATUS_SUCCESS){return hsa_error_code;} if(device_type == HSA_DEVICE_TYPE_GPU){ gpuAgent = agent; } return HSA_STATUS_SUCCESS; } hsa_status_t FindRegions(hsa_region_t region, void *data){ hsa_region_segment_t segment_id; hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id); if (segment_id != HSA_REGION_SEGMENT_GLOBAL) { return HSA_STATUS_SUCCESS; } hsa_region_global_flag_t flags; hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){ systemRegion = region; } if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){ kernArgRegion = region; } return HSA_STATUS_SUCCESS; } hipError_t ihipModuleLoad(hipmodule *module, const char *fname){ std::ifstream in(fname, std::ios::binary | std::ios::ate); hipError_t ret = hipSuccess; if(!in){ std::cout<<"Couldn't read file "<(in), std::istreambuf_iterator(), ptr); status = hsa_code_object_deserialize(ptr, size, NULL, module); if (status != HSA_STATUS_SUCCESS) { std::cout<<"Failed to deserialize code object"<Args; void ***newP = (void***)kernelParams; for(uint32_t i=0;isize -1; uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue); hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]); dispatch_packet->completion_signal = signal; dispatch_packet->workgroup_size_x = blockDimX; dispatch_packet->workgroup_size_y = blockDimY; dispatch_packet->workgroup_size_z = blockDimZ; dispatch_packet->grid_size_x = blockDimX * gridDimX; dispatch_packet->grid_size_y = blockDimY * gridDimY; dispatch_packet->grid_size_z = blockDimZ * gridDimZ; dispatch_packet->group_segment_size = 0; dispatch_packet->private_segment_size = sharedMemBytes; dispatch_packet->kernarg_address = kernarg; dispatch_packet->kernel_object = (uint64_t)f; uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; uint32_t header32 = header | (setup << 16); __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE); hsa_queue_store_write_index_relaxed(Queue, packet_index+1); hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index); hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); return hipSuccess; } #define fileName "vcpy_isa.co" #define kernel_name "hello_world" __global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){ int tx = hipThreadIdx_x; Bd[tx] = Ad[tx]; } amd_kernel_code_t* getAkc(uint64_t handle){ bool ext_supported = false; hsa_status_t status = hsa_system_extension_supported( HSA_EXTENSION_AMD_LOADER, 1, 0, &ext_supported); assert(HSA_STATUS_SUCCESS == status); assert(true == ext_supported); hsa_ven_amd_loader_1_00_pfn_t ext_table = {nullptr}; status = hsa_system_get_extension_table( HSA_EXTENSION_AMD_LOADER, 1, 0, &ext_table); assert(HSA_STATUS_SUCCESS == status); assert(nullptr != ext_table.hsa_ven_amd_loader_query_host_address); std::cout<<"Start"<(handle), &akc); if(HSA_STATUS_SUCCESS != status){ akc = reinterpret_cast(handle); } assert(nullptr!=akc); amd_kernel_code_t *Akc = (amd_kernel_code_t*)akc; std::cout<kernarg_segment_byte_size<(Ad); Bptr = reinterpret_cast(Bd); hsaInit(); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice); hipModule Module; hipFunction Function; hipModuleLoad(&Module, fileName); hipModuleGetFunction(&Function, Module, kernel_name); hipStream_t stream; hipStreamCreate(&stream); void *args[2] = {&Ad, &Bd}; /* struct __attribute__((aligned(16))) args_t{ void *Aptr; void *Bptr; } args; args.Aptr = Ad; args.Bptr = Bd; */ // hipDrvLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, (void**)&args, sizeof(args), 0); amd_kernel_code_t *akc = getAkc(Function.kernel); std::vectorargBuffer(2); memcpy(&argBuffer[0], &Ad, sizeof(void*)); memcpy(&argBuffer[1], &Bd, sizeof(void*)); size_t size = argBuffer.size()*sizeof(void*); void *config[] = { HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END }; hipLaunchModuleKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config); hipStreamDestroy(stream); // hipLaunchKernel(Cpy, dim3(1), dim3(LEN), 0, 0, Ad, Bd); hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost); for(uint32_t i=0;i