Added module api test with gcn binary

Change-Id: I61c3ecc2b34168d10f1a7b15d668630eb2c69c8c
Этот коммит содержится в:
Aditya Atluri
2016-08-23 13:50:19 -05:00
родитель 8f0f97f8f9
Коммит f9d49c2aed
2 изменённых файлов: 257 добавлений и 0 удалений
+257
Просмотреть файл
@@ -0,0 +1,257 @@
#include<hip_runtime.h>
#include<hip_runtime_api.h>
#include<iostream>
#include<fstream>
#include<hsa/hsa.h>
#include<hsa/amd_hsa_kernel_code.h>
#include<hsa/hsa_ven_amd_loader.h>
#include<vector>
#define LEN 64
#define SIZE LEN<<2
typedef hsa_code_object_t hipmodule;
typedef uint64_t hipfunction;
typedef unsigned int hipDevicePtr;
hsa_region_t systemRegion;
hsa_region_t kernArgRegion;
hsa_agent_t gpuAgent;
hsa_queue_t *Queue;
hsa_signal_t signal;
hsa_status_t findGpu(hsa_agent_t agent, void *data){
hsa_device_type_t device_type;
hsa_status_t hsa_error_code = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
if(hsa_error_code != HSA_STATUS_SUCCESS){return hsa_error_code;}
if(device_type == HSA_DEVICE_TYPE_GPU){
gpuAgent = agent;
}
return HSA_STATUS_SUCCESS;
}
hsa_status_t FindRegions(hsa_region_t region, void *data){
hsa_region_segment_t segment_id;
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
if (segment_id != HSA_REGION_SEGMENT_GLOBAL) {
return HSA_STATUS_SUCCESS;
}
hsa_region_global_flag_t flags;
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
systemRegion = region;
}
if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){
kernArgRegion = region;
}
return HSA_STATUS_SUCCESS;
}
hipError_t ihipModuleLoad(hipmodule *module, const char *fname){
std::ifstream in(fname, std::ios::binary | std::ios::ate);
hipError_t ret = hipSuccess;
if(!in){
std::cout<<"Couldn't read file "<<fname<<std::endl;
ret = hipErrorInvalidValue;
}else{
size_t size = std::string::size_type(in.tellg());
void *p = NULL;
hsa_status_t status = hsa_memory_allocate(systemRegion, size, (void**)&p);
assert(status == HSA_STATUS_SUCCESS);
char *ptr = (char*)p;
if(!ptr){
std::cout<<"Error: failed to allocate memory for code object."<<std::endl;
}
in.seekg(0, std::ios::beg);
std::copy(std::istreambuf_iterator<char>(in),
std::istreambuf_iterator<char>(), ptr);
status = hsa_code_object_deserialize(ptr, size, NULL, module);
if (status != HSA_STATUS_SUCCESS) { std::cout<<"Failed to deserialize code object"<<std::endl; }
std::cout<<"File Read!"<<std::endl;
}
return ret;
}
hipError_t ihipModuleGetFunction(hipfunction *func, hipmodule hmod, const char *name){
hsa_status_t status;
hsa_executable_symbol_t kernel_symbol;
hsa_executable_t executable;
status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &executable);
assert(status == HSA_STATUS_SUCCESS);
status = hsa_executable_load_code_object(executable, gpuAgent, hmod, NULL);
assert(status == HSA_STATUS_SUCCESS);
status = hsa_executable_freeze(executable, NULL);
assert(status == HSA_STATUS_SUCCESS);
status = hsa_executable_get_symbol(executable, NULL, name, gpuAgent, 0, &kernel_symbol);
assert(status == HSA_STATUS_SUCCESS);
status = hsa_executable_symbol_get_info(kernel_symbol,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
func);
assert(status == HSA_STATUS_SUCCESS);
return hipSuccess;
}
void hsaInit(){
hsa_status_t status;
uint32_t queue_size;
status = hsa_iterate_agents(findGpu, NULL);
assert(status == HSA_STATUS_SUCCESS);
status = hsa_agent_iterate_regions(gpuAgent, FindRegions, NULL);
assert(status == HSA_STATUS_SUCCESS);
char agent_name[64];
status = hsa_agent_get_info(gpuAgent, HSA_AGENT_INFO_NAME, agent_name);
assert(status == HSA_STATUS_SUCCESS);
status = hsa_agent_get_info(gpuAgent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
assert(status == HSA_STATUS_SUCCESS);
status = hsa_queue_create(gpuAgent, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &Queue);
assert(status == HSA_STATUS_SUCCESS);
status = hsa_signal_create(1, 0, NULL, &signal);
assert(status == HSA_STATUS_SUCCESS);
std::cout<<agent_name<<std::endl;
}
hipError_t hipDrvLaunchKernel(hipfunction f,
uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
uint32_t sharedMemBytes, hipStream_t hStream,
void **kernelParams, uint32_t size, void **extra){
void *kernarg;
std::vector<void*>Args;
void ***newP = (void***)kernelParams;
for(uint32_t i=0;i<size/sizeof(void*);i++){
Args.push_back((void*)*newP[i]);
}
hsa_status_t status = hsa_memory_allocate(kernArgRegion, size, &kernarg);
memcpy(kernarg, Args.data(), size);
const uint32_t queue_mask = Queue->size -1;
uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
dispatch_packet->completion_signal = signal;
dispatch_packet->workgroup_size_x = blockDimX;
dispatch_packet->workgroup_size_y = blockDimY;
dispatch_packet->workgroup_size_z = blockDimZ;
dispatch_packet->grid_size_x = blockDimX * gridDimX;
dispatch_packet->grid_size_y = blockDimY * gridDimY;
dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
dispatch_packet->group_segment_size = 0;
dispatch_packet->private_segment_size = sharedMemBytes;
dispatch_packet->kernarg_address = kernarg;
dispatch_packet->kernel_object = (uint64_t)f;
uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
uint32_t header32 = header | (setup << 16);
__atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
hsa_queue_store_write_index_relaxed(Queue, packet_index+1);
hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
return hipSuccess;
}
#define fileName "vcpy_isa.co"
#define kernel_name "hello_world"
__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
int tx = hipThreadIdx_x;
Bd[tx] = Ad[tx];
}
amd_kernel_code_t* getAkc(uint64_t handle){
bool ext_supported = false;
hsa_status_t status = hsa_system_extension_supported(
HSA_EXTENSION_AMD_LOADER, 1, 0, &ext_supported);
assert(HSA_STATUS_SUCCESS == status);
assert(true == ext_supported);
hsa_ven_amd_loader_1_00_pfn_t ext_table = {nullptr};
status = hsa_system_get_extension_table(
HSA_EXTENSION_AMD_LOADER, 1, 0, &ext_table);
assert(HSA_STATUS_SUCCESS == status);
assert(nullptr != ext_table.hsa_ven_amd_loader_query_host_address);
std::cout<<"Start"<<std::endl;
const void *akc = nullptr;//new amd_kernel_code_t;
std::cout<<"End"<<std::endl;
status = ext_table.hsa_ven_amd_loader_query_host_address(
reinterpret_cast<void*>(handle), &akc);
if(HSA_STATUS_SUCCESS != status){
akc = reinterpret_cast<void*>(handle);
}
assert(nullptr!=akc);
amd_kernel_code_t *Akc = (amd_kernel_code_t*)akc;
std::cout<<Akc->kernarg_segment_byte_size<<std::endl;
return nullptr;
}
int main(){
float *A, *B, *Ad, *Bd;
hipDevicePtr Aptr, Bptr;
A = new float[LEN];
B = new float[LEN];
for(uint32_t i=0;i<LEN;i++){
A[i] = i*1.0f;
B[i] = 0.0f;
std::cout<<A[i] << " "<<B[i]<<std::endl;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
Aptr = reinterpret_cast<unsigned long>(Ad);
Bptr = reinterpret_cast<unsigned long>(Bd);
hsaInit();
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
hipModule Module;
hipFunction Function;
hipModuleLoad(&Module, fileName);
hipModuleGetFunction(&Function, Module, kernel_name);
hipStream_t stream;
hipStreamCreate(&stream);
void *args[2] = {&Ad, &Bd};
/* struct __attribute__((aligned(16))) args_t{
void *Aptr;
void *Bptr;
} args;
args.Aptr = Ad;
args.Bptr = Bd;
*/
// hipDrvLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, (void**)&args, sizeof(args), 0);
amd_kernel_code_t *akc = getAkc(Function.kernel);
std::vector<void*>argBuffer(2);
memcpy(&argBuffer[0], &Ad, sizeof(void*));
memcpy(&argBuffer[1], &Bd, sizeof(void*));
size_t size = argBuffer.size()*sizeof(void*);
void *config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
HIP_LAUNCH_PARAM_END
};
hipLaunchModuleKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config);
hipStreamDestroy(stream);
// hipLaunchKernel(Cpy, dim3(1), dim3(LEN), 0, 0, Ad, Bd);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
for(uint32_t i=0;i<LEN;i++){
std::cout<<A[i]<<" - "<<B[i]<<std::endl;
}
return 0;
}