Signed-off-by: Ben Goz <ben.goz@amd.com>


[ROCm/ROCR-Runtime commit: fb8378a18b]
Этот коммит содержится в:
Ben Goz
2015-08-23 17:42:27 +03:00
родитель 779c76a4e7
Коммит 9db147f2d4
9 изменённых файлов: 513 добавлений и 118 удалений
+14 -4
Просмотреть файл
@@ -249,7 +249,7 @@ struct kfd_ioctl_alloc_memory_of_gpu_args {
uint64_t size; /* to KFD */
uint64_t handle; /* from KFD */
uint32_t gpu_id; /* to KFD */
uint32_t pad;
uint64_t mmap_offset; /* from KFD */
};
struct kfd_ioctl_free_memory_of_gpu_args {
@@ -273,6 +273,12 @@ struct kfd_ioctl_open_graphic_handle_args {
uint32_t pad;
};
/*
 * Argument block for AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE.
 * All three fields are filled in by user space before the ioctl is issued
 * (see set_dgpu_aperture()).
 *
 * NOTE(review): unlike the neighbouring argument structs there is no explicit
 * pad member after node_id; on common 64-bit ABIs the compiler presumably
 * inserts 4 bytes of padding before dgpu_base - confirm the layout matches
 * the kernel side.
 */
struct kfd_ioctl_set_process_dgpu_aperture_args {
uint32_t node_id; /* node the aperture is being set for */
uint64_t dgpu_base; /* first address of the dGPU aperture */
uint64_t dgpu_limit; /* upper bound of the dGPU aperture */
};
#define AMDKFD_IOCTL_BASE 'K'
#define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr)
#define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type)
@@ -342,13 +348,17 @@ struct kfd_ioctl_open_graphic_handle_args {
#define AMDKFD_IOC_OPEN_GRAPHIC_HANDLE \
AMDKFD_IOWR(0x15, struct kfd_ioctl_open_graphic_handle_args)
#define AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH \
AMDKFD_IOWR(0x16, struct kfd_ioctl_alloc_memory_of_gpu_args)
#define AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH \
AMDKFD_IOWR(0x16, struct kfd_ioctl_alloc_memory_of_gpu_args)
#define AMDKFD_IOC_SET_CU_MASK \
AMDKFD_IOW(0x17, struct kfd_ioctl_set_cu_mask_args)
#define AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE \
AMDKFD_IOW(0x18, struct kfd_ioctl_set_process_dgpu_aperture_args)
#define AMDKFD_COMMAND_START 0x01
#define AMDKFD_COMMAND_END 0x18
#define AMDKFD_COMMAND_END 0x19
#endif
+11
Просмотреть файл
@@ -30,7 +30,9 @@
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdio.h>
#include "linux/kfd_ioctl.h"
#include "fmm.h"
static HSAuint64 *events_page = NULL;
@@ -70,6 +72,15 @@ hsaKmtCreateEvent(
args.event_type = EventDesc->EventType;
args.auto_reset = !ManualReset;
/* dGPU code */
if (is_dgpu && events_page == NULL) {
events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8, 0x9000);
if (!events_page) {
return HSAKMT_STATUS_ERROR;
}
fmm_get_handle(events_page, &args.event_page_offset);
}
if (kmtIoctl(kfd_fd, AMDKFD_IOC_CREATE_EVENT, &args) != 0) {
free(e);
*Event = NULL;
+298 -53
Просмотреть файл
@@ -92,10 +92,16 @@ typedef struct {
manageble_aperture_t scratch_aperture;
manageble_aperture_t scratch_physical;
manageble_aperture_t gpuvm_aperture;
manageble_aperture_t dgpu_aperture;
} gpu_mem_t;
static gpu_mem_t gpu_mem[] = INIT_GPUs_MEM;
static HSAKMT_STATUS dgpu_mem_init(uint8_t node_id, void **base, void **limit);
static int set_dgpu_aperture(uint32_t node_id, uint64_t base, uint64_t limit);
static void __fmm_release(uint32_t gpu_id, void *address,
uint64_t MemorySizeInBytes, manageble_aperture_t *aperture);
static vm_area_t *vm_create_and_init_area(void *start, void *end)
{
vm_area_t *area = (vm_area_t *) malloc(sizeof(vm_area_t));
@@ -373,45 +379,24 @@ static int32_t gpu_mem_find_by_gpu_id(uint32_t gpu_id)
return -1;
}
static manageble_aperture_t *find_valid_gpuvm_apperture_of_gpu(uint32_t gpu_id)
{
manageble_aperture_t *aperture;
int32_t gpu_mem_id;
/* Retrieve gpu_mem id according to gpu_id */
gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
if (gpu_mem_id < 0)
return NULL;
aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture;
/* Check that aperture is properly initialized/supported */
if (!aperture_is_valid(aperture->base, aperture->limit))
return NULL;
return aperture;
}
static int fmm_allocate_memory_in_device(uint32_t gpu_id, void *mem,
uint64_t MemorySizeInBytes)
uint64_t MemorySizeInBytes,
manageble_aperture_t *aperture,
uint64_t *mmap_offset)
{
struct kfd_ioctl_alloc_memory_of_gpu_args args;
struct kfd_ioctl_free_memory_of_gpu_args free_args;
manageble_aperture_t *aperture;
if (!mem)
return -1;
/* Retrieve gpuvm aperture according to gpu_id */
aperture = find_valid_gpuvm_apperture_of_gpu(gpu_id);
if (!aperture)
return -1;
/* Allocate memory from amdkfd */
args.gpu_id = gpu_id;
args.size = MemorySizeInBytes;
args.va_addr = VOID_PTRS_SUB(mem, aperture->base);
args.va_addr = (uint64_t)mem;
if (!mmap_offset)
args.va_addr = VOID_PTRS_SUB(mem, aperture->base);
if (kmtIoctl(kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args))
return -1;
@@ -423,6 +408,9 @@ static int fmm_allocate_memory_in_device(uint32_t gpu_id, void *mem,
goto err_object_allocation_failed;
pthread_mutex_unlock(&aperture->fmm_mutex);
if (mmap_offset)
*mmap_offset = args.mmap_offset;
return 0;
err_object_allocation_failed:
@@ -541,24 +529,10 @@ void *fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes)
return (void*)(((((uint64_t)mem) >> 16) + 1) << 16);
}
/*
* The offset from GPUVM aperture base address to ensure that address 0
* (after base subtraction) won't be used
*/
#define GPUVM_APP_OFFSET 0x10000
void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)
static void* __fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes,
manageble_aperture_t *aperture, uint64_t offset, uint64_t *mmap_offset)
{
manageble_aperture_t *aperture;
int32_t gpu_mem_id;
void *mem = NULL;
/* Retrieve gpu_mem id according to gpu_id */
gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
if (gpu_mem_id < 0)
return NULL;
aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture;
/* Check that aperture is properly initialized/supported */
if (!aperture_is_valid(aperture->base, aperture->limit))
return NULL;
@@ -566,14 +540,15 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)
/* Allocate address space */
pthread_mutex_lock(&aperture->fmm_mutex);
mem = aperture_allocate_area(aperture,
MemorySizeInBytes, GPUVM_APP_OFFSET);
MemorySizeInBytes, offset);
pthread_mutex_unlock(&aperture->fmm_mutex);
/*
* Now that we have the area reserved, allocate memory in the device
* itself
*/
if (fmm_allocate_memory_in_device(gpu_id, mem, MemorySizeInBytes)) {
if (fmm_allocate_memory_in_device(gpu_id, mem,
MemorySizeInBytes, aperture, mmap_offset)) {
/*
* allocation of memory in device failed.
* Release region in aperture
@@ -589,6 +564,89 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)
return mem;
}
/*
* The offset from GPUVM aperture base address to ensure that address 0
* (after base subtraction) won't be used
*/
#define GPUVM_APP_OFFSET 0x10000
void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)
{
manageble_aperture_t *aperture;
int32_t gpu_mem_id;
/* Retrieve gpu_mem id according to gpu_id */
gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
if (gpu_mem_id < 0)
return NULL;
aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture;
return __fmm_allocate_device(gpu_id, MemorySizeInBytes,
aperture, GPUVM_APP_OFFSET, NULL);
}
/*
 * Allocate host memory from the C heap.
 *
 * The buffer is aligned to the page size selected by flags.ui32.PageSize and,
 * when flags.ui32.ExecuteAccess is set, is additionally made executable.
 * gpu_id is not used by this path.
 *
 * Returns the buffer (to be released with free()), or NULL on failure.
 */
static void* fmm_allocate_host_cpu(uint32_t gpu_id,
		uint64_t MemorySizeInBytes, HsaMemFlags flags)
{
	void *buffer = NULL;
	const HSAuint64 alignment = PageSizeFromFlags(flags.ui32.PageSize);

	if (posix_memalign(&buffer, alignment, MemorySizeInBytes) != 0)
		return NULL;

	/* Nothing more to do unless the caller asked for executable memory */
	if (!flags.ui32.ExecuteAccess)
		return buffer;

	if (mprotect(buffer, MemorySizeInBytes,
		     PROT_READ | PROT_WRITE | PROT_EXEC) == 0)
		return buffer;

	/* Could not change the protection: give the memory back */
	free(buffer);
	return NULL;
}
/*
 * Allocate host-accessible memory for a dGPU.
 *
 * An address range is carved out of the GPU's dGPU aperture and backed by a
 * KFD allocation; the resulting mmap offset is then used to map that range
 * into the process through kfd_fd at exactly the reserved address.
 *
 * The requested size is always grown to the next multiple of 0x8000 (a size
 * that is already a multiple still gains a full 0x8000). flags is currently
 * not consulted by this path.
 *
 * Returns the mapped address, or NULL when gpu_id is unknown, the aperture
 * allocation fails, or the mmap fails.
 */
static void* fmm_allocate_host_gpu(uint32_t gpu_id,
		uint64_t MemorySizeInBytes, HsaMemFlags flags)
{
	void *mem;
	void *ret;
	manageble_aperture_t *aperture;
	int32_t gpu_mem_id;
	uint64_t mmap_offset = 0;

	(void)flags;

	/* Retrieve gpu_mem id according to gpu_id */
	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
	if (gpu_mem_id < 0)
		return NULL;

	aperture = &gpu_mem[gpu_mem_id].dgpu_aperture;

	MemorySizeInBytes += 0x8000 - (MemorySizeInBytes % 0x8000);

	mem = __fmm_allocate_device(gpu_id, MemorySizeInBytes,
				    aperture, 0, &mmap_offset);
	/*
	 * Bail out before mmap(): with MAP_FIXED a NULL address would be taken
	 * literally, and mmap_offset is not meaningful after a failure.
	 */
	if (!mem)
		return NULL;

	ret = mmap(mem, MemorySizeInBytes,
		   PROT_READ | PROT_WRITE | PROT_EXEC,
		   MAP_SHARED | MAP_FIXED, kfd_fd, mmap_offset);
	if (ret == MAP_FAILED) {
		/* Undo the aperture/KFD allocation made above */
		__fmm_release(gpu_id, mem, MemorySizeInBytes, aperture);
		return NULL;
	}

	return ret;
}
/*
 * Allocate host-accessible memory.
 *
 * dev_id selects the backend: devices reported as dGPU by topology_is_dgpu()
 * get memory from the dGPU aperture, everything else gets ordinary heap
 * memory. Returns the allocated address or NULL on failure.
 */
void* fmm_allocate_host(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFlags flags, uint16_t dev_id)
{
	void *result;

	if (topology_is_dgpu(dev_id))
		result = fmm_allocate_host_gpu(gpu_id, MemorySizeInBytes, flags);
	else
		result = fmm_allocate_host_cpu(gpu_id, MemorySizeInBytes, flags);

	return result;
}
void *fmm_open_graphic_handle(uint32_t gpu_id,
int32_t graphic_device_handle,
uint32_t graphic_handle,
@@ -647,20 +705,14 @@ out:
}
static void __fmm_release(uint32_t gpu_id, void *address,
uint64_t MemorySizeInBytes)
uint64_t MemorySizeInBytes, manageble_aperture_t *aperture)
{
struct kfd_ioctl_free_memory_of_gpu_args args;
manageble_aperture_t *aperture;
vm_object_t *object;
if (!address)
return;
/* Retrieve gpuvm aperture according to gpu_id */
aperture = find_valid_gpuvm_apperture_of_gpu(gpu_id);
if (!aperture)
return;
pthread_mutex_lock(&aperture->fmm_mutex);
/* Find the object to retrieve the handle */
@@ -696,7 +748,16 @@ void fmm_release(void *address, uint64_t MemorySizeInBytes)
if (address >= gpu_mem[i].gpuvm_aperture.base &&
address <= gpu_mem[i].gpuvm_aperture.limit) {
found = true;
__fmm_release(gpu_mem[i].gpu_id, address, MemorySizeInBytes);
__fmm_release(gpu_mem[i].gpu_id, address,
MemorySizeInBytes, &gpu_mem[i].gpuvm_aperture);
fmm_print(gpu_mem[i].gpu_id);
}
if (address >= gpu_mem[i].dgpu_aperture.base &&
address <= gpu_mem[i].dgpu_aperture.limit) {
found = true;
__fmm_release(gpu_mem[i].gpu_id, address,
MemorySizeInBytes, &gpu_mem[i].dgpu_aperture);
fmm_print(gpu_mem[i].gpu_id);
}
}
@@ -713,6 +774,8 @@ HSAKMT_STATUS fmm_init_process_apertures(void)
{
struct kfd_ioctl_get_process_apertures_args args;
uint8_t node_id;
uint32_t gpu_id;
HsaNodeProperties props;
if (kmtIoctl(kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES, (void *) &args))
return HSAKMT_STATUS_ERROR;
@@ -721,6 +784,17 @@ HSAKMT_STATUS fmm_init_process_apertures(void)
gpu_mem[node_id].gpu_id =
args.process_apertures[node_id].gpu_id;
if (topology_sysfs_get_node_props(node_id, &props, &gpu_id) ==
HSAKMT_STATUS_SUCCESS) {
if (topology_is_dgpu(props.DeviceId)) {
dgpu_mem_init(node_id, &gpu_mem[node_id].dgpu_aperture.base,
&gpu_mem[node_id].dgpu_aperture.limit);
set_dgpu_aperture(node_id, (uint64_t)gpu_mem[node_id].dgpu_aperture.base,
(uint64_t)gpu_mem[node_id].dgpu_aperture.limit);
}
}
gpu_mem[node_id].lds_aperture.base =
PORT_UINT64_TO_VPTR(args.process_apertures[node_id].lds_base);
@@ -804,6 +878,34 @@ HSAuint64 fmm_get_aperture_base(aperture_type_e aperture_type, HSAuint32 gpu_id)
}
}
/*
 * Map an object that lives in a dGPU aperture to the GPU.
 *
 * The object is located by its start address inside aperture and its KFD
 * handle is passed to AMDKFD_IOC_MAP_MEMORY_TO_GPU. The lookup and the ioctl
 * both run with the aperture mutex held.
 *
 * gpu_id and size are accepted for signature parity with _fmm_map_to_gpu()
 * but are not used here.
 *
 * Returns 0 on success, -1 when no object is found at address or the ioctl
 * fails.
 */
static int _fmm_map_to_gpu_gtt(uint32_t gpu_id, manageble_aperture_t *aperture,
				void *address, uint64_t size)
{
	/*
	 * Zero the whole argument block: only the handle is filled in below,
	 * and the remaining fields must not reach the kernel as stack garbage.
	 */
	struct kfd_ioctl_map_memory_to_gpu_args args = {0};
	vm_object_t *object;
	int rc = -1;

	(void)gpu_id;
	(void)size;

	pthread_mutex_lock(&aperture->fmm_mutex);

	/* Find the object to retrieve the handle */
	object = vm_find_object_by_address(aperture, address, 0);
	if (object) {
		args.handle = object->handle;
		if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args) == 0)
			rc = 0;
	}

	pthread_mutex_unlock(&aperture->fmm_mutex);

	return rc;
}
static int _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t *aperture,
void *address, uint64_t size,
uint64_t *gpuvm_address)
@@ -855,6 +957,12 @@ int fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address)
return _fmm_map_to_gpu(gpu_mem[i].gpu_id,
&gpu_mem[i].gpuvm_aperture,
address, size, gpuvm_address);
if ((address >= gpu_mem[i].dgpu_aperture.base) &&
(address <= gpu_mem[i].dgpu_aperture.limit))
/* map it */
return _fmm_map_to_gpu_gtt(gpu_mem[i].gpu_id,
&gpu_mem[i].dgpu_aperture,
address, size);
}
/*
@@ -904,7 +1012,144 @@ int fmm_unmap_from_gpu(void *address)
/* unmap it */
return _fmm_unmap_from_gpu(&gpu_mem[i].gpuvm_aperture,
address);
else if ((address >= gpu_mem[i].dgpu_aperture.base) &&
(address <= gpu_mem[i].dgpu_aperture.limit))
/* unmap it */
return _fmm_unmap_from_gpu(&gpu_mem[i].dgpu_aperture,
address);
}
return 0;
}
/* Tonga dGPU specific functions */
static bool is_dgpu_mem_init = false;
static void *dgpu_shared_aperture_base = NULL;
static void *dgpu_shared_aperture_limit = NULL;
static int set_dgpu_aperture(uint32_t node_id, uint64_t base, uint64_t limit)
{
struct kfd_ioctl_set_process_dgpu_aperture_args args;
args.node_id = node_id;
args.dgpu_base = base;
args.dgpu_limit = limit;
return kmtIoctl(kfd_fd, AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE, &args);
}
/*
 * Reserve len bytes of address space without committing memory.
 *
 * addr is passed to mmap() as a hint only (MAP_FIXED is not used), so the
 * returned address may differ from it; callers must compare the two if they
 * need an exact placement. The mapping is anonymous, private, read/write and
 * MAP_NORESERVE.
 *
 * Returns the start of the reservation, or NULL when len is 0, len does not
 * fit in size_t, or mmap() fails. MAP_FAILED is never returned.
 */
static void *reserve_address(void *addr, long long unsigned int len)
{
	void *ret_addr;

	/* len is unsigned: the only invalid small value is 0 */
	if (len == 0)
		return NULL;

	/* Refuse lengths that would be silently truncated by mmap()'s size_t */
	if ((long long unsigned int)(size_t)len != len)
		return NULL;

	ret_addr = mmap(addr, (size_t)len, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);

	/* Check the value mmap() returned, not the hint that was passed in */
	if (ret_addr == MAP_FAILED)
		return NULL;

	return ret_addr;
}
#define ADDRESS_RANGE_LIMIT_MASK 0xFFFFFFFFFF
/*
 * Reserve the process-wide dGPU aperture and report its bounds.
 *
 * The first successful call reserves a range as large as the node's
 * LocalMemSize (truncated to 32 bits) that ends below
 * ADDRESS_RANGE_LIMIT_MASK, and caches the result in
 * dgpu_shared_aperture_base/limit. Later calls return the cached range for
 * any node_id.
 *
 * base/limit: optional out-parameters; each is written only when non-NULL.
 *
 * Returns HSAKMT_STATUS_SUCCESS when a range is available, the error from
 * topology_sysfs_get_node_props() when the node cannot be queried, and
 * HSAKMT_STATUS_ERROR when no suitable range could be reserved.
 */
static HSAKMT_STATUS dgpu_mem_init(uint8_t node_id, void **base, void **limit)
{
	HSAKMT_STATUS ret;
	void *addr, *ret_addr;
	uint32_t max_len;
	uint32_t gpu_id;
	HsaNodeProperties props;

	if (is_dgpu_mem_init) {
		/* Write through the out-parameters, not to the local copies */
		if (base)
			*base = dgpu_shared_aperture_base;
		if (limit)
			*limit = dgpu_shared_aperture_limit;
		return HSAKMT_STATUS_SUCCESS;
	}

	ret = topology_sysfs_get_node_props(node_id, &props, &gpu_id);
	if (ret != HSAKMT_STATUS_SUCCESS)
		return ret;

	max_len = (uint32_t)props.LocalMemSize;
	/* A zero-length reservation can never succeed */
	if (max_len == 0)
		return HSAKMT_STATUS_ERROR;

	/*
	 * Walk hint addresses upwards in 0x8000 steps. The walk stops once the
	 * hint itself no longer fits below the limit, so it terminates even
	 * when every reservation attempt fails.
	 */
	for (addr = (void *)PAGE_SIZE;
	     (unsigned long long)(uintptr_t)addr + max_len <
			ADDRESS_RANGE_LIMIT_MASK;
	     addr = (void *)((uintptr_t)addr + 0x8000)) {
		ret_addr = reserve_address(addr, max_len);
		/* Treat MAP_FAILED like NULL so it is never taken as a base */
		if (!ret_addr || ret_addr == MAP_FAILED)
			continue;

		if ((unsigned long long)(uintptr_t)ret_addr + max_len <
				ADDRESS_RANGE_LIMIT_MASK) {
			void *end = (void *)((uintptr_t)ret_addr + max_len);

			if (base)
				*base = ret_addr;
			if (limit)
				*limit = end;
			dgpu_shared_aperture_base = ret_addr;
			dgpu_shared_aperture_limit = end;
			is_dgpu_mem_init = true;
			return HSAKMT_STATUS_SUCCESS;
		}

		/* The kernel placed the range too high: drop it and retry */
		munmap(ret_addr, max_len);
	}

	return HSAKMT_STATUS_ERROR;
}
/*
 * Fetch the KFD handle of the object that starts at address.
 *
 * The GPUVM and dGPU apertures of every valid GPU are checked (GPUVM first)
 * to find the one whose [base, limit] range contains address; the object is
 * then looked up in that aperture under its mutex.
 *
 * Returns true and stores the handle in *handle when an object is found and
 * handle is non-NULL; returns false otherwise.
 */
bool fmm_get_handle(void *address, uint64_t *handle)
{
	manageble_aperture_t *owner = NULL;
	vm_object_t *obj;
	bool ok = false;
	int32_t idx;

	/* Find the aperture the requested address belongs to */
	for (idx = 0; idx < NUM_OF_SUPPORTED_GPUS && !owner; idx++) {
		if (gpu_mem[idx].gpu_id == NON_VALID_GPU_ID)
			continue;

		if (address >= gpu_mem[idx].gpuvm_aperture.base &&
		    address <= gpu_mem[idx].gpuvm_aperture.limit)
			owner = &gpu_mem[idx].gpuvm_aperture;
		else if (address >= gpu_mem[idx].dgpu_aperture.base &&
			 address <= gpu_mem[idx].dgpu_aperture.limit)
			owner = &gpu_mem[idx].dgpu_aperture;
	}

	if (owner == NULL)
		return false;

	pthread_mutex_lock(&owner->fmm_mutex);

	/* Find the object to retrieve the handle */
	obj = vm_find_object_by_address(owner, address, 0);
	if (obj != NULL && handle != NULL) {
		*handle = obj->handle;
		ok = true;
	}

	pthread_mutex_unlock(&owner->fmm_mutex);

	return ok;
}
+3
Просмотреть файл
@@ -49,6 +49,8 @@ HSAKMT_STATUS fmm_init_process_apertures(void);
*/
void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes);
void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes);
void* fmm_allocate_host(uint32_t gpu_id, uint64_t MemorySizeInBytes,
HsaMemFlags flags, uint16_t dev_id);
void* fmm_open_graphic_handle(uint32_t gpu_id,
int32_t graphic_device_handle,
uint32_t graphic_handle,
@@ -58,6 +60,7 @@ bool fmm_is_inside_some_aperture(void* address);
void fmm_release(void* address, HSAuint64 MemorySizeInBytes);
int fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address);
int fmm_unmap_from_gpu(void *address);
bool fmm_get_handle(void *address, uint64_t *handle);
/* Topology interface*/
HSAKMT_STATUS fmm_node_added(HSAuint32 gpu_id);
+1
Просмотреть файл
@@ -31,3 +31,4 @@ int kfd_fd;
unsigned long kfd_open_count;
unsigned long system_properties_count;
pthread_mutex_t hsakmt_mutex = PTHREAD_MUTEX_INITIALIZER;
bool is_dgpu = false;
+10
Просмотреть файл
@@ -34,6 +34,7 @@
extern int kfd_fd;
extern unsigned long kfd_open_count;
extern pthread_mutex_t hsakmt_mutex;
extern bool is_dgpu;
#undef HSAKMTAPI
#define HSAKMTAPI __attribute__((visibility ("default")))
@@ -65,6 +66,15 @@ HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
uint16_t get_device_id_by_node(HSAuint32 node_id);
HSAKMT_STATUS topology_sysfs_get_gpu_id(uint32_t node_id, uint32_t *gpu_id);
HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, HsaNodeProperties *props, uint32_t *gpu_id);
bool topology_is_dgpu(uint16_t gpu_id);
HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align);
void free_exec_aligned_memory_gpu(void *addr, uint32_t size);
extern int kmtIoctl(int fd, unsigned long request, void *arg);
/* Void pointer arithmetic (or remove -Wpointer-arith to allow void pointers arithmetic) */
+10 -18
Просмотреть файл
@@ -86,7 +86,7 @@ hsaKmtSetMemoryPolicy(
return (err == -1) ? HSAKMT_STATUS_ERROR : HSAKMT_STATUS_SUCCESS;
}
static HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags)
HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags)
{
switch (pageSizeFlags) {
case HSA_PAGE_SIZE_4KB: return 4*1024;
@@ -109,9 +109,8 @@ hsaKmtAllocMemory(
)
{
HSAKMT_STATUS result;
HSAuint64 page_size;
uint32_t gpu_id;
int err;
HSAuint64 page_size;
CHECK_KFD_OPEN();
@@ -119,26 +118,18 @@ hsaKmtAllocMemory(
if (result != HSAKMT_STATUS_SUCCESS)
return result;
/* The required size should be page aligned (GDS?) */
page_size = PageSizeFromFlags(MemFlags.ui32.PageSize);
if ((!MemoryAddress) || (!SizeInBytes) ||
(SizeInBytes & (page_size-1)))
(SizeInBytes & (page_size-1))) {
return HSAKMT_STATUS_INVALID_PARAMETER;
}
if (MemFlags.ui32.HostAccess && !MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
err = posix_memalign(MemoryAddress, page_size, SizeInBytes);
if (err != 0)
return HSAKMT_STATUS_NO_MEMORY;
if (MemFlags.ui32.ExecuteAccess) {
err = mprotect(*MemoryAddress, SizeInBytes,
PROT_READ | PROT_WRITE | PROT_EXEC);
if (err != 0) {
free(*MemoryAddress);
return err;
}
}
*MemoryAddress = fmm_allocate_host(gpu_id, SizeInBytes, MemFlags,
get_device_id_by_node(PreferredNode));
if (*MemoryAddress == NULL)
return HSAKMT_STATUS_ERROR;
return HSAKMT_STATUS_SUCCESS;
}
@@ -224,6 +215,7 @@ hsaKmtUnmapMemoryToGPU(
)
{
CHECK_KFD_OPEN();
if (!fmm_unmap_from_gpu(MemoryAddress))
return HSAKMT_STATUS_SUCCESS;
else
+115 -14
Просмотреть файл
@@ -34,25 +34,42 @@
#include <sys/mman.h>
#include <fcntl.h>
#define TONGA_PAGE_SIZE 0x9000
/*
 * 1024 doorbells, 4 bytes each doorbell.
 * The expansion is parenthesized so the macro behaves as a single value when
 * used inside a larger expression (e.g. as a divisor).
 */
#define DOORBELLS_PAGE_SIZE (1024 * 4)
/*
 * ASIC families known to the queue code. The value is stored in
 * device_info.asic_family and queue.type; CHIP_TONGA is the value the
 * allocation/free helpers test for to select the GPU-memory path.
 */
enum asic_family_type {
CHIP_KAVERI = 0,
CHIP_CARRIZO,
CHIP_TONGA
};
/*
 * Per-ASIC-family queue parameters, looked up by PCI device id through
 * supported_devices[].
 */
struct device_info
{
enum asic_family_type asic_family; /* family this entry describes */
uint32_t ctx_save_restore_size; /* bytes allocated per queue for context save/restore; 0 = no buffer */
uint32_t eop_buffer_size; /* bytes allocated per queue for the EOP buffer; 0 = no buffer */
};
/* Kaveri: both sizes are 0, so no per-queue buffers are allocated. */
struct device_info kaveri_device_info = {
.asic_family = CHIP_KAVERI,
.ctx_save_restore_size = 0,
.eop_buffer_size = 0,
};
/* Carrizo: fixed-size context save/restore area and a 4 KiB EOP buffer. */
struct device_info carrizo_device_info = {
.asic_family = CHIP_CARRIZO,
.ctx_save_restore_size = 2756608,
.eop_buffer_size = 4096,
};
/* Tonga: both buffers are sized TONGA_PAGE_SIZE. */
struct device_info tonga_device_info = {
.asic_family = CHIP_TONGA,
.ctx_save_restore_size = TONGA_PAGE_SIZE,
.eop_buffer_size = TONGA_PAGE_SIZE,
};
struct device_id
{
uint16_t dev_id;
@@ -87,6 +104,8 @@ struct device_id supported_devices[] = {
{ 0x9875, &carrizo_device_info }, /* Carrizo */
{ 0x9876, &carrizo_device_info }, /* Carrizo */
{ 0x9877, &carrizo_device_info }, /* Carrizo */
{ 0x6939, &tonga_device_info },
{ 0x692b, &tonga_device_info },
{ 0, NULL }
};
@@ -97,6 +116,7 @@ struct queue
uint32_t rptr;
void *eop_buffer;
void *ctx_save_restore;
enum asic_family_type type;
};
struct process_doorbells
@@ -121,7 +141,7 @@ static struct device_info *get_device_info_by_dev_id(uint16_t dev_id)
return NULL;
}
static void free_queue(struct queue *q)
static void free_queue_cpu(struct queue *q)
{
if (q->eop_buffer)
free(q->eop_buffer);
@@ -130,7 +150,7 @@ static void free_queue(struct queue *q)
free(q);
}
static void* allocate_exec_aligned_memory(uint32_t size, uint32_t align)
static void* allocate_exec_aligned_memory_cpu(uint32_t size, uint32_t align)
{
void *ptr;
int retval;
@@ -149,13 +169,89 @@ static void* allocate_exec_aligned_memory(uint32_t size, uint32_t align)
return ptr;
}
/*
 * Allocate host-accessible, executable memory through hsaKmtAllocMemory()
 * on node 0 and map it to the GPU.
 *
 * size is grown to the next multiple of align before allocating (a size that
 * is already a multiple still gains a full align).
 *
 * Returns the CPU address of the buffer, or NULL when align is 0, the padded
 * size does not fit in 32 bits, or allocation/mapping fails. A buffer whose
 * mapping failed is freed before returning.
 */
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align)
{
	void *mem = NULL;
	HSAuint64 gpu_va;
	HsaMemFlags flags;
	uint32_t padding;

	/* size % align below would divide by zero */
	if (align == 0)
		return NULL;

	padding = align - (size % align);
	/* Reject requests whose padded size wraps around 32 bits */
	if (size + padding < size)
		return NULL;
	size += padding;

	flags.Value = 0;
	flags.ui32.HostAccess = 1;
	flags.ui32.ExecuteAccess = 1;
	flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;

	if (hsaKmtAllocMemory(0, size, flags, &mem) != HSAKMT_STATUS_SUCCESS)
		return NULL;

	if (hsaKmtMapMemoryToGPU(mem, size, &gpu_va) != HSAKMT_STATUS_SUCCESS) {
		hsaKmtFreeMemory(mem, size);
		return NULL;
	}

	return mem;
}
/*
 * Unmap addr from the GPU and free it.
 *
 * size is padded up to the next multiple of TONGA_PAGE_SIZE before it is
 * handed to hsaKmtFreeMemory(). Nothing is freed when the unmap fails.
 */
void free_exec_aligned_memory_gpu(void *addr, uint32_t size)
{
	const uint32_t padded_size =
		size + (TONGA_PAGE_SIZE - (size % TONGA_PAGE_SIZE));

	if (hsaKmtUnmapMemoryToGPU(addr) != HSAKMT_STATUS_SUCCESS)
		return;

	hsaKmtFreeMemory(addr, padded_size);
}
/*
 * Allocate executable memory suited to the given ASIC family.
 *
 * Tonga buffers come from GPU-mapped memory and always use TONGA_PAGE_SIZE
 * as alignment (the align argument is ignored); every other family uses the
 * CPU allocator with the requested alignment.
 */
static void* allocate_exec_aligned_memory(uint32_t size, uint32_t align, enum asic_family_type type)
{
	return (type == CHIP_TONGA)
		? allocate_exec_aligned_memory_gpu(size, TONGA_PAGE_SIZE)
		: allocate_exec_aligned_memory_cpu(size, align);
}
/*
 * Unmap addr from the GPU and, only if that succeeds, free size bytes of it
 * through hsaKmtFreeMemory().
 */
static void release_exec_aligned_memory_gpu(void *addr, uint32_t size)
{
	if (hsaKmtUnmapMemoryToGPU(addr) != HSAKMT_STATUS_SUCCESS)
		return;

	hsaKmtFreeMemory(addr, (HSAuint64)size);
}
/*
 * Release memory obtained from allocate_exec_aligned_memory().
 *
 * Tonga buffers are unmapped and freed as TONGA_PAGE_SIZE bytes; the size
 * argument is not used. Buffers of every other family are plain heap memory.
 */
static void release_exec_aligned_memory(void *addr, uint32_t size, enum asic_family_type type)
{
	if (type != CHIP_TONGA) {
		free(addr);
		return;
	}

	release_exec_aligned_memory_gpu(addr, TONGA_PAGE_SIZE);
}
/*
 * Tear down a queue whose buffers live in GPU-mapped memory.
 *
 * The EOP buffer and then the context save/restore buffer are unmapped and
 * freed (TONGA_PAGE_SIZE each) when present; finally the queue structure
 * itself is released according to q->type.
 */
static void free_queue_gpu(struct queue *q)
{
	void *const buffers[] = { q->eop_buffer, q->ctx_save_restore };
	unsigned int i;

	for (i = 0; i < sizeof(buffers) / sizeof(buffers[0]); i++) {
		if (!buffers[i])
			continue;
		hsaKmtUnmapMemoryToGPU(buffers[i]);
		hsaKmtFreeMemory(buffers[i], TONGA_PAGE_SIZE);
	}

	release_exec_aligned_memory((void *)q, sizeof(*q), q->type);
}
/*
 * Free a queue and its buffers using the teardown path that matches the
 * ASIC family: GPU-mapped memory for Tonga, heap memory otherwise.
 */
static void free_queue(struct queue *q, enum asic_family_type type)
{
	/*
	 * Plain calls instead of "return f(q);": ISO C does not allow a
	 * return statement with an expression in a void function.
	 */
	if (type == CHIP_TONGA)
		free_queue_gpu(q);
	else
		free_queue_cpu(q);
}
static int handle_concrete_asic(struct device_info *dev_info, struct queue *q,
struct kfd_ioctl_create_queue_args *args)
{
if (dev_info) {
if (dev_info->eop_buffer_size > 0) {
q->eop_buffer =
allocate_exec_aligned_memory(dev_info->eop_buffer_size, PAGE_SIZE);
allocate_exec_aligned_memory(dev_info->eop_buffer_size, PAGE_SIZE, dev_info->asic_family);
if (q->eop_buffer == NULL) {
return HSAKMT_STATUS_NO_MEMORY;
}
@@ -165,7 +261,7 @@ static int handle_concrete_asic(struct device_info *dev_info, struct queue *q,
if (dev_info->ctx_save_restore_size > 0) {
args->ctx_save_restore_size = dev_info->ctx_save_restore_size;
q->ctx_save_restore =
allocate_exec_aligned_memory(dev_info->ctx_save_restore_size, PAGE_SIZE);
allocate_exec_aligned_memory(dev_info->ctx_save_restore_size, PAGE_SIZE, dev_info->asic_family);
if (q->ctx_save_restore == NULL) {;
return HSAKMT_STATUS_NO_MEMORY;
}
@@ -201,30 +297,35 @@ hsaKmtCreateQueue(
if (result != HSAKMT_STATUS_SUCCESS)
return result;
struct queue *q = malloc(sizeof(*q));
dev_id = get_device_id_by_node(NodeId);
dev_info = get_device_info_by_dev_id(dev_id);
struct queue *q = allocate_exec_aligned_memory(sizeof (*q),
PAGE_SIZE, dev_info->asic_family);
if (q == NULL)
return HSAKMT_STATUS_NO_MEMORY;
memset(q, 0, sizeof(*q));
struct kfd_ioctl_create_queue_args args;
memset(&args, 0, sizeof(args));
dev_id = get_device_id_by_node(NodeId);
dev_info = get_device_info_by_dev_id(dev_id);
args.gpu_id = gpu_id;
q->type = dev_info->asic_family;
err = handle_concrete_asic(dev_info, q, &args);
if (err != HSAKMT_STATUS_SUCCESS) {
free_queue(q);
free_queue(q, dev_info->asic_family);
return err;
}
switch (Type)
{
case HSA_QUEUE_COMPUTE: args.queue_type = KFD_IOC_QUEUE_TYPE_COMPUTE; break;
case HSA_QUEUE_SDMA: free_queue(q); return HSAKMT_STATUS_UNAVAILABLE;
case HSA_QUEUE_SDMA: free_queue(q, dev_info->asic_family); return HSAKMT_STATUS_UNAVAILABLE;
case HSA_QUEUE_COMPUTE_AQL: args.queue_type = KFD_IOC_QUEUE_TYPE_COMPUTE_AQL; break;
default: free_queue(q); return HSAKMT_STATUS_INVALID_PARAMETER;
default: free_queue(q, dev_info->asic_family); return HSAKMT_STATUS_INVALID_PARAMETER;
}
if (Type != HSA_QUEUE_COMPUTE_AQL)
@@ -244,7 +345,7 @@ hsaKmtCreateQueue(
if (err == -1)
{
free_queue(q);
free_queue(q, dev_info->asic_family);
return HSAKMT_STATUS_ERROR;
}
@@ -259,7 +360,7 @@ hsaKmtCreateQueue(
if (ptr == MAP_FAILED) {
pthread_mutex_unlock(&doorbells[NodeId].doorbells_mutex);
hsaKmtDestroyQueue(q->queue_id);
free_queue(q);
free_queue(q, dev_info->asic_family);
return HSAKMT_STATUS_ERROR;
}
@@ -321,7 +422,7 @@ hsaKmtDestroyQueue(
struct kfd_ioctl_destroy_queue_args args;
if (q == NULL)
return (HSAKMT_STATUS_INVALID_PARAMETER);
return (HSAKMT_STATUS_INVALID_PARAMETER);
memset(&args, 0, sizeof(args));
@@ -335,7 +436,7 @@ hsaKmtDestroyQueue(
}
else
{
free_queue(q);
free_queue(q, q->type);
return HSAKMT_STATUS_SUCCESS;
}
}
+51 -29
Просмотреть файл
@@ -59,36 +59,40 @@ static struct hsa_gfxip_table {
unsigned char major; // GFXIP Major engine version
unsigned char minor; // GFXIP Minor engine version
unsigned char stepping; // GFXIP Stepping info
unsigned char is_dgpu; // Predicat for dGPU devices
} gfxip_lookup_table[] = {
/* Kaveri Family */
{ 0x1304, 7, 0, 0 },
{ 0x1305, 7, 0, 0 },
{ 0x1306, 7, 0, 0 },
{ 0x1307, 7, 0, 0 },
{ 0x1309, 7, 0, 0 },
{ 0x130A, 7, 0, 0 },
{ 0x130B, 7, 0, 0 },
{ 0x130C, 7, 0, 0 },
{ 0x130D, 7, 0, 0 },
{ 0x130E, 7, 0, 0 },
{ 0x130F, 7, 0, 0 },
{ 0x1310, 7, 0, 0 },
{ 0x1311, 7, 0, 0 },
{ 0x1312, 7, 0, 0 },
{ 0x1313, 7, 0, 0 },
{ 0x1315, 7, 0, 0 },
{ 0x1316, 7, 0, 0 },
{ 0x1317, 7, 0, 0 },
{ 0x1318, 7, 0, 0 },
{ 0x131B, 7, 0, 0 },
{ 0x131C, 7, 0, 0 },
{ 0x131D, 7, 0, 0 },
{ 0x1304, 7, 0, 0, 0 },
{ 0x1305, 7, 0, 0, 0 },
{ 0x1306, 7, 0, 0, 0 },
{ 0x1307, 7, 0, 0, 0 },
{ 0x1309, 7, 0, 0, 0 },
{ 0x130A, 7, 0, 0, 0 },
{ 0x130B, 7, 0, 0, 0 },
{ 0x130C, 7, 0, 0, 0 },
{ 0x130D, 7, 0, 0, 0 },
{ 0x130E, 7, 0, 0, 0 },
{ 0x130F, 7, 0, 0, 0 },
{ 0x1310, 7, 0, 0, 0 },
{ 0x1311, 7, 0, 0, 0 },
{ 0x1312, 7, 0, 0, 0 },
{ 0x1313, 7, 0, 0, 0 },
{ 0x1315, 7, 0, 0, 0 },
{ 0x1316, 7, 0, 0, 0 },
{ 0x1317, 7, 0, 0, 0 },
{ 0x1318, 7, 0, 0, 0 },
{ 0x131B, 7, 0, 0, 0 },
{ 0x131C, 7, 0, 0, 0 },
{ 0x131D, 7, 0, 0, 0 },
/* Carrizo Family */
{ 0x9870, 8, 0, 1 },
{ 0x9874, 8, 0, 1 },
{ 0x9875, 8, 0, 1 },
{ 0x9876, 8, 0, 1 },
{ 0x9877, 8, 0, 1 }
{ 0x9870, 8, 0, 1, 0 },
{ 0x9874, 8, 0, 1, 0 },
{ 0x9875, 8, 0, 1, 0 },
{ 0x9876, 8, 0, 1, 0 },
{ 0x9877, 8, 0, 1, 0 },
/* Tonga Family */
{ 0x6939, 8, 0, 0, 1 },
{ 0x692b, 8, 0, 0, 1 }
};
static void
@@ -203,7 +207,7 @@ err1:
return ret;
}
static HSAKMT_STATUS
HSAKMT_STATUS
topology_sysfs_get_gpu_id(uint32_t node_id, uint32_t *gpu_id) {
FILE *fd;
char path[256];
@@ -222,7 +226,25 @@ topology_sysfs_get_gpu_id(uint32_t node_id, uint32_t *gpu_id) {
return ret;
}
static HSAKMT_STATUS
/*
 * Tell whether the process is running on a dGPU.
 *
 * gpu_id is compared against the device_id column of gfxip_lookup_table.
 * The first time a device flagged as dGPU is seen, the global is_dgpu is
 * set; from then on the function returns true for every argument.
 */
bool topology_is_dgpu(uint16_t gpu_id)
{
	const uint32_t entries =
		sizeof(gfxip_lookup_table) / sizeof(struct hsa_gfxip_table);
	uint32_t idx;

	/* A dGPU was already detected earlier */
	if (is_dgpu)
		return true;

	for (idx = 0; idx < entries; idx++) {
		if (gfxip_lookup_table[idx].device_id != gpu_id)
			continue;
		if (gfxip_lookup_table[idx].is_dgpu != 1)
			continue;

		is_dgpu = true;
		return true;
	}

	return false;
}
HSAKMT_STATUS
topology_sysfs_get_node_props(uint32_t node_id, HsaNodeProperties *props, uint32_t *gpu_id) {
FILE *fd;
char *read_buf, *p;