Merge branch 'amd-develop' into amd-master

Change-Id: I28abeb5f0c81dd0315dcb1c987f22704a9c0d18b
This commit is contained in:
Maneesh Gupta
2016-09-01 16:06:01 +05:30
38 fájl változott, egészen pontosan 3237 új sor hozzáadva és 1030 régi sor törölve
+4
Fájl megtekintése
@@ -117,6 +117,10 @@ Differences or limitations of HIP APIs as compared to CUDA APIs should be clearl
- Keyword TODO refers to a note that should be addressed in long-term. Could be style issue, software architecture, or known bugs.
- FIXME refers to a short-term bug that needs to be addressed.
- HIP_INIT_API() should be placed at the start of each top-level HIP API. This function will make sure the HIP runtime is initialized,
and also constructs an appropriate API string for tracing and CodeXL marker tracing. The arguments to HIP_INIT_API should match
those of the parent fucntion.
#### Presubmit Testing:
Before checking in or submitting a pull request, run all Rodinia tests and ensure pass results match starting point:
+36 -3
Fájl megtekintése
@@ -105,7 +105,7 @@ if ($HIP_PLATFORM eq "hcc") {
}
# Satisfy HCC dependencies
$HIPLDFLAGS .= " -lc++abi";
$HIPLDFLAGS .= " -lc++abi -lsupc++";
$HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt";
# Handle ROCm target platform
@@ -151,13 +151,12 @@ if ($HIP_PLATFORM eq "hcc") {
$HIPCC="$CUDA_PATH/bin/nvcc";
$HIPCXXFLAGS .= " -I$CUDA_PATH/include";
$HIPLDFLAGS = "";
$HIPLDFLAGS = "-lcuda -lcudart";
} else {
printf ("error: unknown HIP_PLATFORM = '$HIP_PLATFORM'");
exit (-1);
}
# Add paths to common HIP includes:
$HIPCXXFLAGS .= " -I$HIP_PATH/include" ;
@@ -177,7 +176,41 @@ if ($verbose & 0x4) {
print "hipcc-args: ", join (" ", @ARGV), "\n";
}
# Handle code object generation
my $ISACMD="";
if($HIP_PLATFORM eq "hcc"){
$ISACMD .= "$HIP_PATH/bin/hipgenisa.sh ";
$ISACMD .= $ROCM_PATH;
if($ARGV[0] eq "--gencodeobject"){
foreach $isaarg (@ARGV[1..$#ARGV]){
$ISACMD .= " ";
$ISACMD .= $isaarg;
}
if ($verbose & 0x1) {
print "hipcc-cmd: ", $ISACMD, "\n";
}
system($ISACMD) and die();
exit(0);
}
}
if($HIP_PLATFORM eq "nvcc"){
$ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
if($ARGV[0] eq "--gencodeobject"){
foreach $isaarg (@ARGV[1..$#ARGV]){
$ISACMD .= " ";
$ISACMD .= $isaarg;
}
if ($verbose & 0x1) {
print "hipcc-cmd: ", $ISACMD, "\n";
}
system($ISACMD) and die();
exit(0);
}
}
my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool
foreach $arg (@ARGV)
{
my $swallowArg = 0;
+31
Fájl megtekintése
@@ -0,0 +1,31 @@
#!/bin/bash
if [ $1 = " " ]
then
exit
fi
ROCM_PATH=$1
GEN_ISA=$2
FILE_NAMES=$3
OUT=$4
OUTPUT_FILE=$5
TARGET=""
if [ ${GEN_ISA:0:12} = "--target-isa" ]
then
TARGET=${GEN_ISA:13:12}
fi
SOURCE="${BASH_SOURCE[0]}"
HIP_PATH="$( command cd -P "$( dirname "$SOURCE" )/.." && pwd )"
export KMDUMPISA=1
export KMDUMPLLVM=1
mkdir /tmp/hipgenisa
$HIP_PATH/bin/hipcc $FILE_NAMES -o /tmp/hipgenisa/a.out
mv dump.* /tmp/hipgenisa/
$ROCM_PATH/hcc-lc/bin/llvm-mc -arch=amdgcn -mcpu=$TARGET -filetype=obj /tmp/hipgenisa/dump.isa -o /tmp/hipgenisa/dump.o
$ROCM_PATH/llvm/bin/clang -target amdgcn--amdhsa /tmp/hipgenisa/dump.o -o $OUTPUT_FILE
rm -r /tmp/hipgenisa
export KMDUMPISA=0
export KMDUMPLLVM=0
A különbségek nem kerülnek megjelenítésre, mivel a fájl túl nagy Load Diff
@@ -42,12 +42,12 @@
| **CUDA** | **HIP** | **CUDA description** |
|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
| `cudaStreamAddCallback` | | Add a callback to a compute stream. |
| `cudaStreamAttachMemAsync` | | Attach memory to a stream asynchronously. |
| `cudaStreamAttachMemAsync` | | Attach managed memory to a stream asynchronously. |
| `cudaStreamCreate` | `hipStreamCreate` | Create an asynchronous stream. |
| `cudaStreamCreateWithFlags` | `hipStreamCreateWithFlags` | Create an asynchronous stream. |
| `cudaStreamCreateWithPriority` | | Create an asynchronous stream with the specified priority. |
| `cudaStreamDestroy` | `hipStreamDestroy` | Destroys and cleans up an asynchronous stream. |
| `cudaStreamGetFlags` | | Query the flags of a stream. |
| `cudaStreamGetFlags` | `hipStreamGetFlags` | Query the flags of a stream. |
| `cudaStreamGetPriority` | | Query the priority of a stream. |
| `cudaStreamQuery` | | Queries an asynchronous stream for completion status. |
| `cudaStreamSynchronize` | `hipStreamSynchronize` | Waits for stream tasks to complete. |
@@ -100,8 +100,8 @@
| `cudaHostAlloc` | `hipHostMalloc` | Allocates page-locked memory on the host. |
| `cudaHostGetDevicePointer` | `hipHostGetDevicePointer` | Passes back device pointer of mapped host memory allocated by cudaHostAlloc or registered by cudaHostRegister. |
| `cudaHostGetFlags` | `hipHostGetFlags` | Passes back flags used to allocate pinned host memory allocated by cudaHostAlloc. |
| `cudaHostRegister` | | Registers an existing host memory range for use by CUDA. |
| `cudaHostUnregister` | | Unregisters a memory range that was registered with cudaHostRegister. |
| `cudaHostRegister` | `hipHostRegister` | Registers an existing host memory range for use by CUDA. |
| `cudaHostUnregister` | `hipHostUnregister` | Unregisters a memory range that was registered with cudaHostRegister. |
| `cudaMalloc` | `hipMalloc` | Allocate memory on the device. |
| `cudaMalloc3D` | | Allocates logical 1D, 2D, or 3D memory objects on the device. |
| `cudaMalloc3DArray` | | Allocate an array on the device. |
@@ -231,7 +231,7 @@
| **CUDA** | **HIP** | **CUDA description** |
|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
| `cudaBindSurfaceToArra`y | | Binds an array to a surface. |
| `cudaBindSurfaceToArray` | | Binds an array to a surface. |
| `cudaBindTexture` | | Binds a memory area to a texture. |
| `cudaBindTexture2D` | | Binds a 2D memory area to a texture. |
| `cudaBindTextureToArray` | | Binds an array to a texture. |
+23 -7
Fájl megtekintése
@@ -32,36 +32,52 @@
HIP provides the following:
- Devices (hipSetDevice(), hipGetDeviceProperties(), etc.)
- Memory management (hipMalloc(), hipMemcpy(), hipFree(), etc.)
- Streams (hipStreamCreate(), etc.)
- Streams (hipStreamCreate(),hipStreamSynchronize(), hipStreamWaitEvent(), etc.)
- Events (hipEventRecord(), hipEventElapsedTime(), etc.)
- Kernel launching (hipLaunchKernel is a standard C/C++ function that replaces <<< >>>)
- HIP Module API to control when adn how code is loaded.
- CUDA-style kernel coordinate functions (threadIdx, blockIdx, blockDim, gridDim)
- Cross-lane instructions including shfl, ballot, any, all
- Most device-side math built-ins
- Error reporting (hipGetLastError(), hipGetErrorString())
The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API.
### What is not supported?
#### Run-time features
#### Runtime/Driver API features
At a high-level, the following features are not supported:
- Textures
- MemcpyToSymbol functions
- Dynamic parallelism (CUDA 5.0)
- Managed memory (CUDA 6.5)
- Graphics interoperation with OpenGL or Direct3D
- CUDA Driver API (Under Development)
- CUDA IPC Functions (Under Development)
- CUDA array, mipmappedArray and pitched memory
- CUDA Driver API
- MemcpyToSymbol functions
- Queue priority controls
See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information.
#### Kernel language features
- Device-side dynamic memory allocations (malloc, free, new, delete) (CUDA 4.0)
- Virtual functions, indirect functions and try/catch (CUDA 4.0)
- `__prof_trigger`
- PTX assembly (CUDA 4.0)
- Several kernel features are under development. See the [HIP Kernel Language](hip_kernel_language.md) for more information.
- PTX assembly (CUDA 4.0). HCC supports inline GCN assembly.
- Several kernel features are under development. See the [HIP Kernel Language](hip_kernel_language.md) for more information. These include:
- printf
- assert
- `__restrict__`
- `__launch_bounds__`
- `__threadfence*_`, `__syncthreads*`
- Unbounded loop unroll
### Is HIP a drop-in replacement for CUDA?
No. HIP provides porting tools which do most of the work do convert CUDA code into portable C++ code that uses the HIP APIs.
Most developers will port their code from CUDA to HIP and then maintain the HIP version.
HIP code provides the same performance as coding in native CUDA, plus the benefit that the code can also run on AMD platforms.
HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms.
### What specific version of CUDA does HIP support?
HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of functionality provided in CUDA, and the hipify tools can
+20 -4
Fájl megtekintése
@@ -156,7 +156,7 @@ The `__constant__` keyword is supported. The host writes constant memory before
### `__shared__`
The `__shared__` keyword is supported.
`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter. This feature is under development.
`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter. HIP uses an alternate syntax based on the HIP_DYNAMIC_SHARED macro.
### `__managed__`
Managed memory, including the `__managed__` keyword, are not supported in HIP.
@@ -537,7 +537,6 @@ HIP supports the following atomic operations.
### Caveats and Features Under-Development:
- HIP enables atomic operations on 32-bit integers. Additionally, it supports an atomic float add. AMD hardware, however, implements the float add using a CAS loop, so this function may not perform efficiently.
- wrapping increment and decrement are under development.
## Warp Cross-Lane Functions
@@ -573,8 +572,6 @@ Applications can test whether the target platform supports the any/all instructi
### Warp Shuffle Functions
The following warp shuffle instructions are under development.
Half-float shuffles are not supported. The default width is warpSize---see [Warp Cross-Lane Functions](#warp-cross-lane-functions). Applications should not assume the warpSize is 32 or 64.
```
@@ -670,3 +667,22 @@ The following C++ features are not supported:
- Run-time-type information (RTTI)
- Virtual functions
- Try/catch
## Kernel Compilation
HIP now supports compiling C++/HIP kernels to binary. Eventhough HIP does not support fatbinary (yet), the user can specify the target for which the binary can be generated. The file format for binary is `.co` which means Code Object. The following command builds the binary using `hipcc`.
`hipcc --genisa --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE]`
```[TARGET GPU] = fiji/hawaii
[INPUT FILE] = Name of the file containing kernels
[OUTPUT FILE] = Name of the generated code object file```
Note that the kernel file should have `int main(){}` at the end it so that the binary is generated. This happens because HCC generates binaries at linking time rather than compilation
You need 3 things to run kernel in binary.
1. Kernel Binary
2. Name of kernel binary
3. Name of the kernel
We already got first two of them. In order to get name of the kernel, try `objdump -x [OUTPUT FILE]`. OUTPUT FILE is file generated by hipcc during kernel compilation. The output from objdump has symbol to the kernel whose name is mangled with `grid_launch_parm`, `__functor`, `__cxxamp_trampoline`. An example of how it looks is `ZN12_GLOBAL__N_137_Z3Cpy16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf` where `Cpy` is the name of the kernel written in C++.
@@ -0,0 +1,42 @@
# HIP Performance Optimizations
Please note that this document lists possible ways for experimenting with HIP stack to gain performance. Performance may vary from platform to platform.
### Unpinned Memory Transfer Optimizations
#### On Small BAR Setup
There are two possible ways to transfer data from Host to Device (H2D) and Device to Host(D2H)
* Using Staging Buffers
* Using PinInPlace
#### On Large BAR Setup
There are two possible ways to transfer data from Host to Device (H2D)
* Using Staging Buffers
* Using PinInPlace
* Direct Memcpy
And there are two possible ways to transfer data from Device to Host (D2H)
* Using Staging Buffers
* Using PinInPlace
Some GPUs may not be able to directly access host memory, and in these cases we need to
stage the copy through an optimized pinned staging buffer, to implement H2D and D2H copies.The copy is broken into buffer-sized chunks to limit the size of the buffer and also to provide better performance by overlapping the CPU copies with the DMA copies.
PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA
engine.
By default staging buffers are used for unpinned memory transfers, however other ways can be used by enabling few environment variables (so no need to build the code again!!!)
Following environment variables can be used:
- HIP_PININPLACE - This environment variable forces the use of PinInPlace logic for all unpinned memory copies
- HIP_OPTIMAL_MEM_TRANSFER- This environment variable enables a hybrid memory copy logic based on thresholds. These thresholds can be managed with following environment variables:
- HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE - Threshold in bytes for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic.
- HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING - Threshold in bytes for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic.
- HIP_D2H_MEM_TRANSFER_THRESHOLD - Threshold in bytes for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic.
@@ -0,0 +1,257 @@
# Porting CUDA Driver API
## Introduction to the CUDA Driver and Runtime APIs
CUDA provides a separate CUDA Driver and Runtime APIs. The two APis have significant overlap in functionality:
- Both APIs support events, streams, memory management, memory copy, and error handling.
- Both APIs deliver similar performance.
- Driver APIs calls begin with the prefix `cu` while Runtime APIs begin with the prefix `cuda`. For example, the Driver API API contains `cuEventCreate` while the Runtime API contains `cudaEventCreate`, with similar functionality.
- The Driver API defines a different but largely overlapping error code code space than the Runtime API, and uses a different coding convention. For example, Driver API defines `CUDA_ERROR_INVALID_VALUE` while the Runtime API defines `cudaErrorInvalidValue`
The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs.
### cuModule API
The Module section of the Driver API provides additional control over how and when accelerator
code objects are loaded.
For example, the driver API allows code objects to be loaded from files or memory pointers.
Symbols for kernels or global data can be extracted from the loaded code objects.
In contrast, the Runtime API automatically loads and (if necessary) compiles all of the
kernels from a executable binary when run.
In this mode, NVCC must be used to compile kernel code so the automatic loading can function correctly.
Both Driver and Runtime APIs define a function for launching kernels (called `cuLaunchKernel` or `cudaLaunchKernel`.
The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function.
The Runtime additionally provides the `<<< >>>` syntax for launching kernels, which resembles
a special function call and is easier to use than explicit launch API (in particular with respect to
handling of kernel arguments).
However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code.
The Module features are useful in an environment which generate the code objects directly, such as a new
accelerator language front-end. Here, NVCC is not used. Instead, the environment may have a
different kernel language or different compilation flow.
Other environments have many kernels and do not want them to be all loaded automatically.
The Module functions can be used to load the generated code objects and launch kernels.
As we will see below, HIP defines a Module API which provides similar explict control over code
object managemenet.
### cuCtx API
The Driver API defines "Context" and "Devices" as separate entities.
Contexts contain a single device, and a device can theoretically have multiple contexts.
Each context contains a set of streams and events specific to the context.
Historically contexts also defined a unique address space for the GPU, though this may not longer be the case in Unified Memory platforms (since the CPU and all the devices in the same process share a single unified address space).
The Context APIs also provide a mechanism to switch between devices, which allowed
a single CPU thread to send commands to different GPUs. HIP as well as a recent versions
of CUDA Runtime provide other mechanisms to accomplish this feat - for example using streams or
`cudaSetDevice`.
The CUDA Runtime API unifies the Context API with the Device API. This simplifies the APIs
and has little loss of functionality since each Context can contain a single device,
and the benefits of multiple contexts has been replaced with other interfaces. HIP provides
a context API to facilitate easy porting from existing Driver codes.
In HIP, the Ctx functions largely provide an alternate syntax for changing the active device.
Most new applications will prefer to use `hipSetDevice` or the stream APIs.
## HIP Module and Ctx APIs
Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules
and Ctx control.
### hipModule API
Like the CUDA Driver API, the Module API provides additional control
over how code is loaded, including options to load code from files or from in-memory pointers.
NVCC and HCC target different architectures and use different code object formats : NVCC
is `cubin` or `ptx` files, while the HCC path is the `hsaco` format. The external
compilers which generate these code objects are responsible for generating and loading
the correct code object for each platform. Notably, there is not a fat binary format that
can contain code for both NVCC and HCC platforms. The following table summarizes the
formats used on each platform:
| Format | APIs | NVCC | HCC |
| --- | --- | --- | --- |
| Code Object | hipModuleLoad, hipModuleLoadData | .cubin or PTX text | .hsaco |
| Fat Binary | hipModuleLoadFatBin | .fatbin | Under Development |
hipcc uses NVCC and HCC to compile host codes. Both of these may embed code objects
into the final executable, and these code objects will be automatically loaded when
the application starts.
The hipModule API can be used to load additional code objects, and in this way
provides an extended capability to the automatically loaded code objects. HCC allows
both of these capabilities to be used together, if desired. Of course it is possible
to create a program with no kernels and thus no automatic loading.
### hipCtx API
HIP provides a `Ctx` API as a thin layer over the existing Device functions. This Ctx API
can be used to set the current context, or to query properties of the device associated with
the context. The current context is implicitly used by other APIs such as `hipStreamCreate`.
### hipify translation of CUDA Driver API
The hipify tool will convert CUDA Driver APIs for streams, events, memory management to
the equivalent HIP driver calls. For example, `cuEventCreate` will be translated to
`hipEventCreate`. Hipify also converts error code from the Driver namespace and coding
convention to the equivalent HIP error code. Thus, HIP unifies the APis for these common functions.
[hipify support for translating driver API is Under Development]
The memory copy APIs require additional explanation. The CUDA driver includes the memory
direction in the name of the API (ie `cuMemcpyH2D`) while the CUDA driver API provides
a single memory copy API with a parameter that specifies the direction and additionally
supports a "default" direction where the runtime determines the direction automatically.
HIP provides APis with both styles: for example, `hipMemcpyH2D` as well as `hipMemcpy`.
The first flavor may be faster in some cases since they avoid host overhead to detect the
different memory directions.
HIP defines a single error space, and uses camel-case for all errors (i.e. `hipErrorInvalidValue`).
### HCC Implementation Notes
#### .hsaco
The .hsaco format used by HCC is described in more detail [here](https://github.com/RadeonOpenCompute/ROCm-ComputeABI-Doc).
An example and blog that show how to use the format is [here](http://gpuopen.com/rocm-with-harmony-combining-opencl-hcc-hsa-in-a-single-program). hsaco can be generated by hcc + extractkernel tool,
cloc, the GCN assembler, or other tools.
#### Address Spaces
HCC defines a process-wide address space where the CPU and all devices allocate
addresses from a single unified pool. Thus addresses may be shared between contexts, and
unlike the original CUDA definition a new context does not create a new address space for
the device.
#### Using hipModuleLaunchKernel
`hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`. The argument `kernelParams` is not fully implemented for HCC. The workaround for it is, to use platform specific macros for each target. Or, `extra` argument can be used which works on both the platforms.
#### Additional Information
- HCC allocates staging buffers (used for unpinned copies) on a per-device basis.
- HCC creates a primary context when the HIP API is called. So in a pure driver API code, HIP/HCC will create a primary context while HIP/NVCC will have empty context stack. HIP/HCC will push primary context to context stack when it is empty. This can have subtle differences on applications which mix the runtime and driver APIs.
### NVCC Implementation Notes
#### Interoperation between HIP and CUDA Driver
CUDA applications may want to mix CUDA driver code with HIP code (see example below). This
table shows the type equivalence to enable this interaction.
|**HIP Type** |**CU Driver Type**| **CUDA Runtime Type** |
| ---- | ---- | ---- |
| hipModule | CUmodule | |
| hipFunction | CUfunction | |
| hipCtx_t | CUcontext | |
| hipDevice_t | CUdevice | |
| hipStream_t | CUstream | cudaStream_t |
| hipEvent_t | CUevent | cudaEvent_t |
| hipArray | CUarray | cudaArray |
#### Compilation Flags
The hipModule interface does not support the hipModuleLoadEx function, which is used to control PTX compilaton options.
HCC does not use PTX and does not support the same compilation options.
In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as part of the load step.
Code which requires this functionaly should use platform-specific coding, calling `cuModuleLoadEx`
on the NVCC path and hipModuleLoad on the hcc path. For example:
```
hipModule module;
void *imagePtr = ... ; // Somehow populate data pointer with code object
#ifdef __HIP_PLATFORM_NVCC__
// Use CUDA driver API but write to hipModule since they are same type:
const int numOptions = 1;
CUJit_option options[numOptions];
void * optionValues[numOptions];
options[0] = CU_JIT_MAX_REGISTERS;
unsigned maxRegs=15;
optionValues[0] = (void*) (&maxRegs);
cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
#else // __HIP_PLATFORM_HCC__
// HCC path does not support or require JIT options, so just load the module.
hipModuleLoadData(&module, imagePtr);
#endif
// Back to unified code - both paths above loaded the "module" variable.
hipFunction k;
hipModuleGetFunction(&k, module, "myKernel");
```
The below sample shows how to use `hipModuleGetFunction`.
```
#include<hip_runtime.h>
#include<hip_runtime_api.h>
#include<iostream>
#include<fstream>
#include<vector>
#define LEN 64
#define SIZE LEN<<2
#ifdef __HIP_PLATFORM_HCC__
#define fileName "vcpy_isa.co"
#endif
#ifdef __HIP_PLATFORM_NVCC__
#define fileName "vcpy_isa.ptx"
#endif
#define kernel_name "hello_world"
int main(){
float *A, *B;
hipDeviceptr_t Ad, Bd;
A = new float[LEN];
B = new float[LEN];
for(uint32_t i=0;i<LEN;i++){
A[i] = i*1.0f;
B[i] = 0.0f;
std::cout<<A[i] << " "<<B[i]<<std::endl;
}
#ifdef __HIP_PLATFORM_NVCC__
hipInit(0);
hipDevice_t device;
hipCtx_t context;
hipDeviceGet(&device, 0);
hipCtxCreate(&context, 0, device);
#endif
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpyHtoD(Ad, A, SIZE);
hipMemcpyHtoD(Bd, B, SIZE);
hipModule_t Module;
hipFunction_t Function;
hipModuleLoad(&Module, fileName);
hipModuleGetFunction(&Function, Module, kernel_name);
std::vector<void*>argBuffer(2);
memcpy(&argBuffer[0], &Ad, sizeof(void*));
memcpy(&argBuffer[1], &Bd, sizeof(void*));
size_t size = argBuffer.size()*sizeof(void*);
void *config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
HIP_LAUNCH_PARAM_END
};
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&con fig);
hipMemcpyDtoH(B, Bd, SIZE);
for(uint32_t i=0;i<LEN;i++){
std::cout<<A[i]<<" - "<<B[i]<<std::endl;
}
#ifdef __HIP_PLATFORM_NVCC__
hipCtxDetach(context);
#endif
return 0;
}
```
+43 -3
Fájl megtekintése
@@ -395,13 +395,53 @@ For new projects or ports which can be re-factored, we recommend the use of the
This indicates that the code is standard C++ code, but also provides a unique indication for make tools to
run hipcc when appropriate.
### Workarounds
## Workarounds
#### warpSize
### warpSize
Code should not assume a warp size of 32 or 64. See [Warp Cross-Lane Functions](hip_kernel_language.md#warp-cross-lane-functions) for information on how to write portable wave-aware code.
## memcpyToSymbol
#### Textures and Cache Control
HIP support for hipMemCpyToSymbol is under-development. This feature allows a kernel
to define a device-side data symbol which can be accessed on the host side. The symbol
can be in __constant or device space. As a workaround, programs can pass the symbol
as an argument to the kernel, and use standard hipMemcpy routines to initialize it.
For example:
Device Code:
```
// Cuda Device Code
__constant__ float Array[1024];
__global__ void Inc(float *Out){
Int tx = hipThreadIdx_x;
Out[tx] = Array[tx] + 1;
}
// HIP Device Code
__global__ void Inc(hipLaunchParm lp, float *Array, float *Out){
Int tx = hipThreadIdx_x;
Out[tx] = Array[tx] + 1;
}
```
Host Code:
```
// CUDA Host Code
cudaMemcpyToSymbol(Array, hostArray, sizeofArray);
// HIP Host Code
hipMemcpy(Array, hostArray, sizeofArray);
```
## threadfence_system
Threadfence_system makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices.
Some implementations can provide this behavior by flushing the GPU L2 cache.
HIP/HCC does not provide this functionality. As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to
disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have
a performance impact.
### Textures and Cache Control
>Texture support is under-development and not yet supported by HIP.
+172
Fájl megtekintése
@@ -0,0 +1,172 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIPCOMPLEX_H
#define HIPCOMPLEX_H
typedef struct{
float x;
float y;
}hipFloatComplex;
__device__ static inline float hipCrealf(hipFloatComplex z){
return z.x;
}
__device__ static inline float hipCimagf(hipFloatComplex z){
return z.y;
}
__device__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
hipFloatComplex z;
z.x = a;
z.y = b;
return z;
}
__device__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
hipFloatComplex ret;
ret.x = z.x;
ret.y = -z.y;
return ret;
}
__device__ static inline float hipCsqabsf(hipFloatComplex z){
return z.x * z.x + z.y * z.y;
}
__device__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
}
__device__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
}
__device__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
}
__device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
float sqabs = hipCsqabsf(q);
hipFloatComplex ret;
ret.x = (p.x * q.x + p.y * q.y)/sqabs;
ret.y = (p.y * q.x - p.x * q.y)/sqabs;
return ret;
}
__device__ static inline float hipCabsf(hipFloatComplex z){
return sqrtf(hipCsqabsf(z));
}
typedef struct{
double x;
double y;
}hipDoubleComplex;
__device__ static inline double hipCreal(hipDoubleComplex z){
return z.x;
}
__device__ static inline double hipCimag(hipDoubleComplex z){
return z.y;
}
__device__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
hipDoubleComplex z;
z.x = a;
z.y = b;
return z;
}
__device__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
hipDoubleComplex ret;
ret.x = z.x;
ret.y = z.y;
return ret;
}
__device__ static inline double hipCsqabs(hipDoubleComplex z){
return z.x * z.x + z.y * z.y;
}
__device__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
}
__device__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
}
__device__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
}
__device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
double sqabs = hipCsqabs(q);
hipDoubleComplex ret;
ret.x = (p.x * q.x + p.y * q.y)/sqabs;
ret.y = (p.y * q.x - p.x * q.y)/sqabs;
return ret;
}
__device__ static inline double hipCabs(hipDoubleComplex z){
return sqrtf(hipCsqabs(z));
}
typedef hipFloatComplex hipComplex;
__device__ static inline hipComplex make_hipComplex(float x,
float y){
return make_hipFloatComplex(x, y);
}
__device__ static inline hipFloatComplex hipComplexDoubleToFloat
(hipDoubleComplex z){
return make_hipFloatComplex((float)z.x, (float)z.y);
}
__device__ static inline hipDoubleComplex hipComplexFloatToDouble
(hipFloatComplex z){
return make_hipDoubleComplex((double)z.x, (double)z.y);
}
__device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
float real = (p.x * q.x) + r.x;
float imag = (q.x * p.y) + r.y;
real = -(p.y * q.y) + real;
imag = (p.x * q.y) + imag;
return make_hipComplex(real, imag);
}
__device__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
float real = (p.x * q.x) + r.x;
float imag = (q.x * p.y) + r.y;
real = -(p.y * q.y) + real;
imag = (p.x * q.y) + imag;
return make_hipDoubleComplex(real, imag);
}
#endif
+29 -4
Fájl megtekintése
@@ -160,12 +160,17 @@ class ihipCtx_t;
#endif
// Just initialize the HIP runtime, but don't log any trace information.
#define HIP_INIT()\
std::call_once(hip_initialized, ihipInit);\
ihipCtxStackUpdate();
// This macro should be called at the beginning of every HIP API.
// It initialies the hip runtime (exactly once), and
// generate trace string that can be output to stderr or to ATP file.
#define HIP_INIT_API(...) \
std::call_once(hip_initialized, ihipInit);\
HIP_INIT()\
API_TRACE(__VA_ARGS__);
#define ihipLogStatus(hipStatus) \
@@ -396,6 +401,22 @@ public:
typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
class ihipModule_t{
public:
hsa_executable_t executable;
hsa_code_object_t object;
std::string fileName;
void *ptr;
size_t size;
};
class ihipFunction_t{
public:
hsa_executable_symbol_t kernel_symbol;
uint64_t kernel;
};
// Internal stream structure.
class ihipStream_t {
public:
@@ -404,8 +425,9 @@ typedef uint64_t SeqNum_t ;
~ihipStream_t();
// kind is hipMemcpyKind
void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind);
void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind);
void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true);
void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true);
void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
@@ -423,7 +445,7 @@ typedef uint64_t SeqNum_t ;
// Use this if we already have the stream critical data mutex:
void wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
void launchModuleKernel(hsa_signal_t signal, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t sharedMemBytes, void *kernarg, size_t kernSize, uint64_t kernel);
// Non-threadsafe accessors - must be protected by high-level stream lock with accessor passed to function.
SIGSEQNUM lastCopySeqId (LockedAccessor_StreamCrit_t &crit) const { return crit->_last_copy_signal ? crit->_last_copy_signal->_sigId : 0; };
@@ -443,6 +465,7 @@ public:
hc::accelerator_view _av;
unsigned _flags;
private:
void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, ihipSignal_t *completionSignal);
void waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal);
@@ -648,6 +671,8 @@ extern void ihipInit();
extern const char *ihipErrorString(hipError_t);
extern ihipCtx_t *ihipGetTlsDefaultCtx();
extern void ihipSetTlsDefaultCtx(ihipCtx_t *ctx);
extern hipError_t ihipSynchronize(void);
extern hipError_t ihipCtxStackUpdate();
extern ihipDevice_t *ihipGetDevice(int);
ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
+60 -9
Fájl megtekintése
@@ -568,16 +568,66 @@ __device__ void __threadfence_system(void);
#define hipGridDim_y (hc_get_num_groups(1))
#define hipGridDim_z (hc_get_num_groups(2))
// loop unrolling
__device__ static inline void* memcpy(void* dst, void* src, size_t size)
{
uint64_t i = 0;
uint64_t totalLength = size/sizeof(uint32_t);
for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
i<(totalLength/4);
i = i + hipBlockDim_x * hipGridDim_x)
{
((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
}
if(4*i < totalLength){
((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
}
return nullptr;
}
__device__ static inline void* memset(void* ptr, uint8_t val, size_t size)
{
uint32_t _val = 0;
_val = (val | val << 8 | val << 16 | val << 24);
uint64_t totalLength = size/sizeof(uint32_t);
uint64_t i = 0;
for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
i<(totalLength/4);
i = i + hipBlockDim_x * hipGridDim_x)
{
((uint32_t*)ptr)[4*i] = _val;
((uint32_t*)ptr)[4*i+1] = _val;
((uint32_t*)ptr)[4*i+2] = _val;
((uint32_t*)ptr)[4*i+3] = _val;
}
if(4*i < totalLength){
((uint32_t*)ptr)[4*i] = _val;
((uint32_t*)ptr)[4*i+1] = _val;
((uint32_t*)ptr)[4*i+2] = _val;
((uint32_t*)ptr)[4*i+3] = _val;
}
return nullptr;
}
#define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
#define HIP_KERNEL_NAME(...) __VA_ARGS__
#ifdef __HCC_CPP__
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp);
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp);
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp);
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp);
void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp);
extern void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream);
extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
// TODO - move to common header file.
#define KNRM "\x1B[0m"
@@ -589,10 +639,9 @@ do {\
grid_launch_parm lp;\
lp.dynamic_group_mem_bytes = _groupMemBytes; \
hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \
if (HIP_TRACE_API) {\
fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
#_kernelName, lp.grid_dim.x, lp.grid_dim.y, lp.grid_dim.z, lp.group_dim.x, lp.group_dim.y, lp.group_dim.z, lp.dynamic_group_mem_bytes, (void*)(_stream));\
}\
if (HIP_TRACE_API) {\
ihipPrintKernelLaunch(#_kernelName, &lp, _stream); \
}\
_kernelName (lp, ##__VA_ARGS__);\
ihipPostLaunchKernel(trueStream, lp);\
} while(0)
@@ -653,4 +702,6 @@ do {\
*/
#endif
+231 -10
Fájl megtekintése
@@ -52,13 +52,13 @@ typedef struct ihipDevice_t *hipDevice_t;
typedef struct ihipStream_t *hipStream_t;
typedef uint64_t hipFunction;
typedef struct ihipModule_t *hipModule_t;
typedef uint64_t hipModule;
typedef struct ihipFunction_t *hipFunction_t;
typedef struct hipEvent_t {
struct ihipEvent_t *_handle;
} hipEvent_t;
typedef void* hipDeviceptr_t;
typedef struct ihipEvent_t *hipEvent_t;
/**
@@ -844,6 +844,14 @@ hipError_t hipHostFree(void* ptr);
*/
hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes);
hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes);
hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes);
hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes);
/**
* @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area pointed to by @p offset bytes from the start of symbol @p symbol.
@@ -1054,32 +1062,173 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src
hipError_t hipInit(unsigned int flags) ;
/**
*-------------------------------------------------------------------------------------------------
*-------------------------------------------------------------------------------------------------
* @defgroup Context Management
* @{
*/
// TODO-ctx
/**
* @brief Create a context and set it as current/ default context
*
* @param [out] ctx
* @param [in] flags
* @param [in] associated device handle
*
* @returns #hipSuccess, #hipErrorInvalidContext
*/
hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device);
hipError_t hipCtxDestroy(hipCtx_t ctx);
/**
* @brief Pop the current/default context and return the popped context.
*
* @param [out] ctx
*
* @returns #hipSuccess
*/
hipError_t hipCtxPopCurrent(hipCtx_t* ctx);
/**
* @brief Push the context to be set as current/ default context
*
* @param [in] ctx
*
* @returns #hipSuccess, #hipErrorInvalidContext
*/
hipError_t hipCtxPushCurrent(hipCtx_t ctx);
/**
* @brief Set the passed context as current/default
*
* @param [in] ctx
*
* @returns #hipSuccess
*/
hipError_t hipCtxSetCurrent(hipCtx_t ctx);
/**
* @brief Get the handle of the current/ default context
*
* @param [out] ctx
*
* @returns #hipSuccess
*/
hipError_t hipCtxGetCurrent(hipCtx_t* ctx);
/**
* @brief Get the handle of the device associated with current/default context
*
* @param [out] device
*
* @returns #hipSuccess, #hipErrorInvalidContext
*/
hipError_t hipCtxGetDevice(hipDevice_t *device);
/**
* @brief Returns the approximate HIP api version.
*
* @warning The HIP feature set does not correspond to an exact CUDA SDK api revision.
* This function always set *apiVersion to 4 as an approximation though HIP supports
* some features which were introduced in later CUDA SDK revisions.
* HIP apps code should not rely on the api revision number here and should
* use arch feature flags to test device capabilities or conditional compilation.
*
*/
hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion);
/**
* @brief Set Cache configuration for a specific function
*
* Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures.
*
*/
hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig );
/**
* @brief Set L1/Shared cache partition.
*
* Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures.
*
*/
hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig );
/**
* @brief Set Shared memory bank configuration.
*
* Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures.
*
*/
hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config );
/**
* @brief Get Shared memory bank configuration.
*
* Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures.
*
*/
hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig );
/**
* @brief Blocks until the default context has completed all preceding requested tasks.
*
* This function waits for all streams on the default context to complete execution, and then returns.
*
* @returns #hipSuccess.
*/
hipError_t hipCtxSynchronize ( void );
/**
* @brief Get flags used for creating current/default context.
*
* @param [out] flags
*
* @returns #hipSuccess.
*/
hipError_t hipCtxGetFlags ( unsigned int* flags );
/**
* @brief Enables direct access to memory allocations in a peer context.
*
* Memory which already allocated on peer device will be mapped into the address space of the current device. In addition, all
* future memory allocations on peerDeviceId will be mapped into the address space of the current device when the memory is allocated.
* The peer memory remains accessible from the current device until a call to hipDeviceDisablePeerAccess or hipDeviceReset.
*
*
* @param [in] peerCtx
* @param [in] flags
*
* Returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
* @returns #hipErrorPeerAccessAlreadyEnabled if peer access is already enabled for this device.
* @warning PeerToPeer support is experimental.
*/
hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags);
/**
* @brief Disable direct access from current context's virtual address space to memory allocations physically located on a peer context.Disables direct access to memory allocations in a peer context and unregisters any registered allocations.
*
* Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been enabled from the current device.
*
* @param [in] peerCtx
*
* @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
* @warning PeerToPeer support is experimental.
*/
hipError_t hipCtxDisablePeerAccess (hipCtx_t peerCtx);
// doxygen end Context Management
/**
* @}
*/
// TODO-ctx
/**
* @return hipSuccess, hipErrorInvalidDevice
@@ -1099,12 +1248,84 @@ hipError_t hipDeviceGetFromId(hipDevice_t *device, int deviceId);
*/
hipError_t hipDriverGetVersion(int *driverVersion) ;
/**
* @brief Loads code object from file into a hipModule_t
*
* @param [in] fname
* @param [out] module
*
* @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorFileNotFound, hipErrorOutOfMemory, hipErrorSharedObjectInitFailed, hipErrorNotInitialized
*
*
*/
hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
hipError_t hipModuleLoad(hipModule *module, const char *fname);
/**
* @brief Freeing the module
*
* @param [in] module
*
* @returns hipSuccess, hipInvalidValue
* module is freed and the code objects associated with it are destroyed
*
*/
hipError_t hipModuleGetFunction(hipFunction *function, hipModule module, const char *kname);
hipError_t hipModuleUnload(hipModule_t module);
hipError_t hipLaunchModuleKernel(hipFunction f,
/**
* @brief Function with kname will be extracted present in module
*
* @param [in] module
* @param [in] kname
* @param [out] function
*
* @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorNotInitialized, hipErrorNotFound,
*/
hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, const char *kname);
/**
* @brief returns device memory pointer and size of the kernel present in the module with symbol - name
*
* @param [in] moodule
* @param [in] name
* @param [out] dptr
* @param [out[ bytes
*
* @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
*/
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
/**
* @brief builds module from code object which resides in host memory. And image is pointer to that location.
*
* @param [in] image
* @param [out] module
*
* @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory, hipErrorNotInitialized
*/
hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
/**
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kerneelparams or extra
*
* @param [in[ f
* @param [in] gridDimX
* @param [in] gridDimY
* @param [in] gridDimZ
* @param [in] blockDimX
* @param [in] blockDimY
* @param [in] blockDimZ
* @param [in] sharedMemBytes
* @param [in] stream
* @param [in] kernelParams
* @param [in] extraa
*
* The function takes the above arguments and run the kernel in hipFunction_t f. with launch parameters specified in gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY and blockDimmZ. The amount of shared memory is specificed and can be used with HIP_DYNAMIC_SHARED. The arguemt extra is used to pass in the arguments for the kernel.
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
*
* @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
*/
hipError_t hipModuleLaunchKernel(hipFunction_t f,
unsigned int gridDimX,
unsigned int gridDimY,
unsigned int gridDimZ,
@@ -1114,7 +1335,7 @@ hipError_t hipLaunchModuleKernel(hipFunction f,
unsigned int sharedMemBytes,
hipStream_t stream,
void **kernelParams,
void **extra) __attribute__((deprecated("kernelParams is not fully supported, use extra instead"))) ;
void **extra) ;
// doxygen end Version Management
/**
+31
Fájl megtekintése
@@ -0,0 +1,31 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hip/hip_common.h>
#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
#include <hip/hcc_detail/hipComplex.h>
#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
#include <hip/nvcc_detail/hipComplex.h>
#else
#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
#endif
+108
Fájl megtekintése
@@ -0,0 +1,108 @@
#ifndef HIPCOMPLEX_H
#define HIPCOMPLEX_H
#include"cuComplex.h"
typedef cuFloatComplex hipFloatComplex;
__device__ __host__ static inline float hipCrealf(hipFloatComplex z){
return cuCrealf(z);
}
__device__ __host__ static inline float hipCimagf(hipFloatComplex z){
return cuCimagf(z);
}
__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
return make_cuFloatComplex(a, b);
}
__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
return cuConjf(z);
}
__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){
return cuCabsf(z) * cuCabsf(z);
}
__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
return cuCaddf(p, q);
}
__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
return cuCsubf(p, q);
}
__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
return cuCmulf(p, q);
}
__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
return cuCdivf(p, q);
}
__device__ __host__ static inline float hipCabsf(hipFloatComplex z){
return cuCabsf(p, q);
}
typedef cuDoubleComplex hipDoubleComplex;
__device__ __host__ static inline double hipCreal(hipDoubleComplex z){
return cuCreal(z);
}
__device__ __host__ static inline double hipCimag(hipDoubleComplex z){
return cuCimag(z);
}
__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
return make_cuDoubleComplex(a, b);
}
__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
return cuConj(z);
}
__device__ __host__ static inline hipDoubleComplex hipCsqabs(hipDoubleComplex z){
return cuCabs(z) * cuCabs(z);
}
__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
return cuCadd(p, q);
}
__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
return cuCsub(p, q);
}
__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
return cuCdiv(p, q);
}
__device__ __host__ static inline double hipCabs(hipDoubleComplex z){
return cuCabs(z);
}
typedef cuFloatComplex hipComplex;
__device__ __host__ static inline hipComplex make_Complex(float x, float y){
return make_cuComplex(x, y);
}
__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z){
return cuComplexDoubleToFloat(z);
}
__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z){
return cuComplexFloatToDouble(z);
}
__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
return cuCfmaf(p, q, r);
}
__device__ __host__ static inline hipDoubleComplex hipCfma(hipComplex p, hipComplex q, hipComplex r){
return cuCfma(p, q, r);
}
#endif
+194 -16
Fájl megtekintése
@@ -22,7 +22,7 @@ THE SOFTWARE.
#pragma once
#include <cuda_runtime_api.h>
#include <cuda.h>
#ifdef __cplusplus
extern "C" {
@@ -58,8 +58,20 @@ hipMemcpyHostToHost
#define hipHostRegisterPortable cudaHostRegisterPortable
#define hipHostRegisterMapped cudaHostRegisterMapped
#define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE
#define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END
typedef cudaEvent_t hipEvent_t;
typedef cudaStream_t hipStream_t;
typedef CUcontext hipCtx_t;
typedef CUsharedconfig hipSharedMemConfig;
typedef CUfunc_cache hipFuncCache;
typedef CUdevice hipDevice_t;
typedef CUmodule hipModule_t;
typedef CUfunction hipFunction_t;
typedef CUdeviceptr hipDeviceptr_t;
//typedef cudaChannelFormatDesc hipChannelFormatDesc;
#define hipChannelFormatDesc cudaChannelFormatDesc
@@ -85,6 +97,20 @@ switch(cuError) {
};
}
inline static hipError_t hipCUResultTohipError(CUresult cuError) { //TODO Populate further
switch(cuError) {
case CUDA_SUCCESS : return hipSuccess;
case CUDA_ERROR_OUT_OF_MEMORY : return hipErrorMemoryAllocation ;
case CUDA_ERROR_INVALID_VALUE : return hipErrorInvalidValue ;
case CUDA_ERROR_INVALID_DEVICE : return hipErrorInvalidDevice ;
case CUDA_ERROR_DEINITIALIZED : return hipErrorDeinitialized ;
case CUDA_ERROR_NO_DEVICE : return hipErrorNoDevice ;
case CUDA_ERROR_INVALID_CONTEXT : return hipErrorInvalidContext ;
case CUDA_ERROR_NOT_INITIALIZED : return hipErrorNotInitialized ;
default : return hipErrorUnknown; // Note - translated error.
};
}
// TODO match the error enum names of hip and cuda
inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
switch(hError) {
@@ -124,6 +150,11 @@ default:
}
}
inline static hipError_t hipInit(unsigned int flags)
{
return hipCUResultTohipError(cuInit(flags));
}
inline static hipError_t hipDeviceReset() {
return hipCUDAErrorTohipError(cudaDeviceReset());
}
@@ -182,6 +213,25 @@ inline static hipError_t hipHostFree(void* ptr) {
inline static hipError_t hipSetDevice(int device) {
return hipCUDAErrorTohipError(cudaSetDevice(device));
}
inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst,
void* src, size_t size)
{
return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size));
}
inline static hipError_t hipMemcpyDtoH(void* dst,
hipDeviceptr_t src, size_t size)
{
return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size));
}
inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst,
hipDeviceptr_t src, size_t size)
{
return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size));
}
inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind copyKind) {
return hipCUDAErrorTohipError(cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind)));
}
@@ -347,20 +397,6 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
return hipCUDAErrorTohipError(cerror);
}
template<class T>
inline static hipError_t hipOccupancyMaxPotentialBlockSize(
int *minGridSize,
int *blockSize,
T func,
size_t dynamicSMemSize = 0,
int blockSizeLimit = 0,
unsigned int flags = 0
){
cudaError_t cerror;
cerror = cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags);
return hipCUDAErrorTohipError(cerror);
}
inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
int *numBlocks,
const void* func,
@@ -458,7 +494,6 @@ inline static hipError_t hipDriverGetVersion(int *driverVersion)
return hipCUDAErrorTohipError(err);
}
inline static hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int device, int peerDevice )
{
return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice));
@@ -474,6 +509,16 @@ inline static hipError_t hipDeviceEnablePeerAccess ( int peerDevice, unsigned
return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess ( peerDevice, flags ));
}
inline static hipError_t hipCtxDisablePeerAccess ( hipCtx_t peerCtx )
{
return hipCUResultTohipError(cuCtxDisablePeerAccess ( peerCtx ));
}
inline static hipError_t hipCtxEnablePeerAccess ( hipCtx_t peerCtx, unsigned int flags )
{
return hipCUResultTohipError(cuCtxEnablePeerAccess ( peerCtx, flags ));
}
inline static hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t count )
{
return hipCUDAErrorTohipError(cudaMemcpyPeer ( dst, dstDevice, src, srcDevice, count ));
@@ -499,12 +544,145 @@ inline static hipError_t hipEventQuery(hipEvent_t event)
return hipCUDAErrorTohipError(cudaEventQuery(event));
}
inline static hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device)
{
return hipCUResultTohipError(cuCtxCreate ( ctx,flags,device ));
}
inline static hipError_t hipCtxDestroy(hipCtx_t ctx)
{
return hipCUResultTohipError(cuCtxDestroy ( ctx ));
}
inline static hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
{
return hipCUResultTohipError(cuCtxPopCurrent ( ctx ));
}
inline static hipError_t hipCtxPushCurrent(hipCtx_t ctx)
{
return hipCUResultTohipError(cuCtxPushCurrent ( ctx ));
}
inline static hipError_t hipCtxSetCurrent(hipCtx_t ctx)
{
return hipCUResultTohipError(cuCtxSetCurrent ( ctx ));
}
inline static hipError_t hipCtxGetCurrent(hipCtx_t* ctx)
{
return hipCUResultTohipError(cuCtxGetCurrent ( ctx ));
}
inline static hipError_t hipCtxGetDevice(hipDevice_t *device)
{
return hipCUResultTohipError(cuCtxGetDevice ( device ));
}
inline static hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
{
return hipCUResultTohipError(cuCtxGetApiVersion ( ctx,(unsigned int*)apiVersion ));
}
inline static hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
{
return hipCUResultTohipError(cuCtxGetCacheConfig ( cacheConfig ));
}
inline static hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
{
return hipCUResultTohipError(cuCtxSetCacheConfig ( cacheConfig ));
}
inline static hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
{
return hipCUResultTohipError(cuCtxSetSharedMemConfig ( config ));
}
inline static hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
{
return hipCUResultTohipError(cuCtxGetSharedMemConfig ( pConfig ));
}
inline static hipError_t hipCtxSynchronize ( void )
{
return hipCUResultTohipError(cuCtxSynchronize ( ));
}
inline static hipError_t hipCtxGetFlags ( unsigned int* flags )
{
return hipCUResultTohipError(cuCtxGetFlags ( flags ));
}
inline static hipError_t hipCtxDetach(hipCtx_t ctx)
{
return hipCUResultTohipError(cuCtxDetach(ctx));
}
inline static hipError_t hipDeviceGet(hipDevice_t *device, int ordinal)
{
return hipCUResultTohipError(cuDeviceGet(device, ordinal));
}
inline static hipError_t hipModuleLoad(hipModule_t *module, const char* fname)
{
return hipCUResultTohipError(cuModuleLoad(module, fname));
}
inline static hipError_t hipModuleUnload(hipModule_t hmod)
{
return hipCUResultTohipError(cuModuleUnload(hmod));
}
inline static hipError_t hipModuleGetFunction(hipFunction_t *function,
hipModule_t module, const char *kname)
{
return hipCUResultTohipError(cuModuleGetFunction(function, module, kname));
}
inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
hipModule_t hmod, const char* name)
{
return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name));
}
inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
{
return hipCUResultTohipError(cuModuleLoadData(module, image));
}
inline static hipError_t hipModuleLaunchKernel(hipFunction_t f,
unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
unsigned int sharedMemBytes, hipStream_t stream,
void **kernelParams, void **extra)
{
return hipCUResultTohipError(cuLaunchKernel(f,
gridDimX, gridDimY, gridDimZ,
blockDimX, blockDimY, blockDimZ,
sharedMemBytes, stream, kernelParams, extra));
}
#ifdef __cplusplus
}
#endif
#ifdef __CUDACC__
template<class T>
inline static hipError_t hipOccupancyMaxPotentialBlockSize(
int *minGridSize,
int *blockSize,
T func,
size_t dynamicSMemSize = 0,
int blockSizeLimit = 0,
unsigned int flags = 0
){
cudaError_t cerror;
cerror = cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags);
return hipCUDAErrorTohipError(cerror);
}
template <class T, int dim, enum cudaTextureReadMode readMode>
inline static hipError_t hipBindTexture(size_t *offset,
const struct texture<T, dim, readMode> &tex,
@@ -0,0 +1,21 @@
HIP_PATH?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
HIPCC=$(HIP_PATH)/bin/hipcc
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
ifeq (${HIP_PLATFORM}, hcc)
GENCODEOBJECT_FLAGS=--target-isa-fiji
endif
all: vcpy_isa.compile runKernel.hip.out
vcpy_isa.compile: vcpy_isa.cpp
$(HIPCC) --gencodeobject $(GENCODEOBJECT_FLAGS) vcpy_isa.cpp -o vcpy_isa.co
runKernel.hip.out: runKernel.cpp
$(HIPCC) runKernel.cpp -o runKernel.hip.out
clean:
rm -f *.co *.out
@@ -0,0 +1,105 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include<hip_runtime.h>
#include<hip_runtime_api.h>
#include<iostream>
#include<fstream>
#include<vector>
#define LEN 64
#define SIZE LEN<<2
#ifdef __HIP_PLATFORM_HCC__
#define fileName "vcpy_isa.co"
#define kernel_name "ZN12_GLOBAL__N_146_Z11hello_world16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf"
#endif
#ifdef __HIP_PLATFORM_NVCC__
#define fileName "vcpy_isa.ptx"
#define kernel_name "hello_world"
#endif
int main(){
float *A, *B;
hipDeviceptr_t Ad, Bd;
A = new float[LEN];
B = new float[LEN];
for(uint32_t i=0;i<LEN;i++){
A[i] = i*1.0f;
B[i] = 0.0f;
}
#ifdef __HIP_PLATFORM_NVCC__
hipInit(0);
hipDevice_t device;
hipCtx_t context;
hipDeviceGet(&device, 0);
hipCtxCreate(&context, 0, device);
#endif
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpyHtoD(Ad, A, SIZE);
hipMemcpyHtoD(Bd, B, SIZE);
hipModule_t Module;
hipFunction_t Function;
hipModuleLoad(&Module, fileName);
hipModuleGetFunction(&Function, Module, kernel_name);
uint32_t len = LEN;
uint32_t one = 1;
std::vector<void*>argBuffer(5);
uint32_t *ptr32_t = (uint32_t*)&argBuffer[0];
memcpy(ptr32_t + 0, &one, sizeof(uint32_t));
memcpy(ptr32_t + 1, &one, sizeof(uint32_t));
memcpy(ptr32_t + 2, &one, sizeof(uint32_t));
memcpy(ptr32_t + 3, &len, sizeof(uint32_t));
memcpy(ptr32_t + 4, &one, sizeof(uint32_t));
memcpy(ptr32_t + 5, &one, sizeof(uint32_t));
memcpy(&argBuffer[3], &Ad, sizeof(void*));
memcpy(&argBuffer[4], &Bd, sizeof(void*));
size_t size = argBuffer.size()*sizeof(void*);
void *config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
HIP_LAUNCH_PARAM_END
};
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config);
hipMemcpyDtoH(B, Bd, SIZE);
for(uint32_t i=LEN-4;i<LEN;i++){
std::cout<<A[i]<<" - "<<B[i]<<std::endl;
}
#ifdef __HIP_PLATFORM_NVCC__
hipCtxDetach(context);
#endif
return 0;
}
@@ -0,0 +1,9 @@
#include<hip_runtime.h>
__global__ void hello_world(hipLaunchParm lp, float *a, float *b)
{
int tx = hipThreadIdx_x;
b[tx] = a[tx];
}
int main(){}
@@ -0,0 +1,6 @@
extern "C" __global__ void hello_world(float *a, float *b)
{
int tx = threadIdx.x;
b[tx] = a[tx];
}
@@ -0,0 +1,38 @@
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19856038
// Cuda compilation tools, release 7.5, V7.5.17
// Based on LLVM 3.4svn
//
.version 4.3
.target sm_20
.address_size 64
// .globl hello_world
.visible .entry hello_world(
.param .u64 hello_world_param_0,
.param .u64 hello_world_param_1
)
{
.reg .f32 %f<2>;
.reg .b32 %r<2>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [hello_world_param_0];
ld.param.u64 %rd2, [hello_world_param_1];
cvta.to.global.u64 %rd3, %rd2;
cvta.to.global.u64 %rd4, %rd1;
mov.u32 %r1, %tid.x;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
add.s64 %rd7, %rd3, %rd5;
st.global.f32 [%rd7], %f1;
ret;
}
+116 -11
Fájl megtekintése
@@ -29,7 +29,22 @@ THE SOFTWARE.
// Stack of contexts
thread_local std::stack<ihipCtx_t *> tls_ctxStack;
hipError_t ihipCtxStackUpdate()
{
//HIP_INIT_API();
hipError_t e = hipSuccess;
if(tls_ctxStack.empty()) {
tls_ctxStack.push(ihipGetTlsDefaultCtx());
}
return ihipLogStatus(e);
}
/**
* @return #hipSuccess, #hipErrorInvalidValue
*/
//---
hipError_t hipInit(unsigned int flags)
{
HIP_INIT_API(flags);
@@ -44,7 +59,10 @@ hipError_t hipInit(unsigned int flags)
return ihipLogStatus(e);
}
/**
* @return #hipSuccess
*/
//---
hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device)
{
HIP_INIT_API(ctx, flags, device); // FIXME - review if we want to init
@@ -57,7 +75,10 @@ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device)
return ihipLogStatus(e);
}
/**
* @return #hipSuccess, #hipErrorInvalidDevice
*/
//---
hipError_t hipDeviceGet(hipDevice_t *device, int deviceId)
{
HIP_INIT_API(device, deviceId); // FIXME - review if we want to init
@@ -88,20 +109,39 @@ hipError_t hipDriverGetVersion(int *driverVersion)
return ihipLogStatus(hipSuccess);
}
/**
* @return #hipSuccess, #hipErrorInvalidValue
*/
//---
hipError_t hipCtxDestroy(hipCtx_t ctx)
{
HIP_INIT_API(ctx);
hipError_t e = hipSuccess;
ihipCtx_t* currentCtx= ihipGetTlsDefaultCtx();
if(currentCtx == ctx) {
//need to destroy the ctx associated with calling thread
tls_ctxStack.pop();
ihipCtx_t* primaryCtx= ((ihipDevice_t*)ctx->getDevice())->_primaryCtx;
if(primaryCtx== ctx)
{
e = hipErrorInvalidValue;
}
delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
else
{
if(currentCtx == ctx) {
//need to destroy the ctx associated with calling thread
tls_ctxStack.pop();
}
delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
}
return ihipLogStatus(e);
}
/**
* @return #hipSuccess
*/
//---
hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
{
HIP_INIT_API(ctx);
hipError_t e = hipSuccess;
ihipCtx_t* tempCtx;
*ctx = ihipGetTlsDefaultCtx();
@@ -119,8 +159,13 @@ hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
return ihipLogStatus(e);
}
/**
* @return #hipSuccess, #hipErrorInvalidContext
*/
//---
hipError_t hipCtxPushCurrent(hipCtx_t ctx)
{
HIP_INIT_API(ctx);
hipError_t e = hipSuccess;
if(ctx != NULL) { //TODO- is this check needed?
ihipSetTlsDefaultCtx(ctx);
@@ -132,19 +177,30 @@ hipError_t hipCtxPushCurrent(hipCtx_t ctx)
return ihipLogStatus(e);
}
/**
* @return #hipSuccess
*/
//---
hipError_t hipCtxGetCurrent(hipCtx_t* ctx)
{
HIP_INIT_API(ctx);
hipError_t e = hipSuccess;
*ctx = ihipGetTlsDefaultCtx();
if(*ctx == nullptr) {
*ctx = NULL; //TODO - is it required? Can return nullptr?
if(!tls_ctxStack.empty()) {
*ctx= tls_ctxStack.top();
}
else {
*ctx = NULL;
}
return ihipLogStatus(e);
}
/**
* @return #hipSuccess
*/
//---
hipError_t hipCtxSetCurrent(hipCtx_t ctx)
{
HIP_INIT_API(ctx);
hipError_t e = hipSuccess;
if(ctx == NULL) {
tls_ctxStack.pop();
@@ -156,8 +212,13 @@ hipError_t hipCtxSetCurrent(hipCtx_t ctx)
return ihipLogStatus(e);
}
/**
* @return #hipSuccess, #hipErrorInvalidContext
*/
//---
hipError_t hipCtxGetDevice(hipDevice_t *device)
{
HIP_INIT_API(device);
hipError_t e = hipSuccess;
ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
@@ -171,6 +232,10 @@ hipError_t hipCtxGetDevice(hipDevice_t *device)
return ihipLogStatus(e);
}
/**
* @return #hipSuccess
*/
//---
hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
{
HIP_INIT_API(apiVersion);
@@ -182,6 +247,10 @@ hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
return ihipLogStatus(hipSuccess);
}
/**
* @return #hipSuccess
*/
//---
hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
{
HIP_INIT_API(cacheConfig);
@@ -191,6 +260,10 @@ hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
return ihipLogStatus(hipSuccess);
}
/**
* @return #hipSuccess
*/
//---
hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
{
HIP_INIT_API(cacheConfig);
@@ -200,6 +273,10 @@ hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
return ihipLogStatus(hipSuccess);
}
/**
* @return #hipSuccess
*/
//---
hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
{
HIP_INIT_API(config);
@@ -209,6 +286,10 @@ hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
return ihipLogStatus(hipSuccess);
}
/**
* @return #hipSuccess
*/
//---
hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
{
HIP_INIT_API(pConfig);
@@ -216,4 +297,28 @@ hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
*pConfig = hipSharedMemBankSizeFourByte;
return ihipLogStatus(hipSuccess);
}
}
/**
* @return #hipSuccess
*/
//---
hipError_t hipCtxSynchronize ( void )
{
HIP_INIT_API(1);
return ihipSynchronize(); //TODP Shall check validity of ctx?
}
/**
* @return #hipSuccess
*/
//---
hipError_t hipCtxGetFlags ( unsigned int* flags )
{
HIP_INIT_API(flags);
hipError_t e = hipSuccess;
ihipCtx_t* tempCtx;
tempCtx = ihipGetTlsDefaultCtx();
*flags = tempCtx->_ctxFlags;
return ihipLogStatus(e);
}
+2 -5
Fájl megtekintése
@@ -160,11 +160,8 @@ hipError_t hipSetDevice(int deviceId)
*/
hipError_t hipDeviceSynchronize(void)
{
HIP_INIT_API();
ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
return ihipLogStatus(hipSuccess);
HIP_INIT_API(1);
return ihipSynchronize();
}
+10 -9
Fájl megtekintése
@@ -33,13 +33,14 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
// TODO - support hipEventDefault, hipEventBlockingSync, hipEventDisableTiming
if (flags == 0) {
ihipEvent_t *eh = event->_handle = new ihipEvent_t();
ihipEvent_t *eh = new ihipEvent_t();
eh->_state = hipEventStatusCreated;
eh->_stream = NULL;
eh->_flags = flags;
eh->_timestamp = 0;
eh->_copySeqId = 0;
*event = eh;
} else {
e = hipErrorInvalidValue;
}
@@ -71,7 +72,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
{
HIP_INIT_API(event, stream);
ihipEvent_t *eh = event._handle;
ihipEvent_t *eh = event;
if (eh && eh->_state != hipEventStatusUnitialized) {
eh->_stream = stream;
@@ -106,10 +107,10 @@ hipError_t hipEventDestroy(hipEvent_t event)
{
HIP_INIT_API(event);
event._handle->_state = hipEventStatusUnitialized;
event->_state = hipEventStatusUnitialized;
delete event._handle;
event._handle = NULL;
delete event;
event = NULL;
// TODO - examine return additional error codes
return ihipLogStatus(hipSuccess);
@@ -121,7 +122,7 @@ hipError_t hipEventSynchronize(hipEvent_t event)
{
HIP_INIT_API(event);
ihipEvent_t *eh = event._handle;
ihipEvent_t *eh = event;
if (eh) {
if (eh->_state == hipEventStatusUnitialized) {
@@ -150,8 +151,8 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
{
HIP_INIT_API(ms, start, stop);
ihipEvent_t *start_eh = start._handle;
ihipEvent_t *stop_eh = stop._handle;
ihipEvent_t *start_eh = start;
ihipEvent_t *stop_eh = stop;
ihipSetTs(start);
ihipSetTs(stop);
@@ -195,7 +196,7 @@ hipError_t hipEventQuery(hipEvent_t event)
{
HIP_INIT_API(event);
ihipEvent_t *eh = event._handle;
ihipEvent_t *eh = event;
// TODO-stream - need to read state of signal here: The event may have become ready after recording..
// TODO-HCC - use get_hsa_signal here.
+65 -8
Fájl megtekintése
@@ -158,7 +158,12 @@ ihipCtx_t *ihipGetTlsDefaultCtx()
return tls_defaultCtx;
}
hipError_t ihipSynchronize(void)
{
ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
return ihipLogStatus(hipSuccess);
}
//=================================================================================================
// ihipSignal_t:
@@ -482,6 +487,53 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t
}
void ihipStream_t::launchModuleKernel(hsa_signal_t signal,
uint32_t blockDimX,
uint32_t blockDimY,
uint32_t blockDimZ,
uint32_t gridDimX,
uint32_t gridDimY,
uint32_t gridDimZ,
uint32_t sharedMemBytes,
void *kernarg,
size_t kernSize,
uint64_t kernel){
hsa_status_t status;
void *kern;
hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(_av.get_hsa_kernarg_region());
status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern);
status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)_av.get_hsa_agent(), 0, kern);
memcpy(kern, kernarg, kernSize);
hsa_queue_t *Queue = (hsa_queue_t*)_av.get_hsa_queue();
const uint32_t queue_mask = Queue->size-1;
uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
dispatch_packet->completion_signal = signal;
dispatch_packet->workgroup_size_x = blockDimX;
dispatch_packet->workgroup_size_y = blockDimY;
dispatch_packet->workgroup_size_z = blockDimZ;
dispatch_packet->grid_size_x = blockDimX * gridDimX;
dispatch_packet->grid_size_y = blockDimY * gridDimY;
dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
dispatch_packet->group_segment_size = 0;
dispatch_packet->private_segment_size = sharedMemBytes;
dispatch_packet->kernarg_address = kern;
dispatch_packet->kernel_object = kernel;
uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
uint32_t header32 = header | (setup << 16);
__atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
hsa_queue_store_write_index_relaxed(Queue, packet_index + 1);
hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
}
//=============================================================================
// Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
@@ -1260,13 +1312,20 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
}
}
void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream)
{
std::string streamString = ToString(stream);
fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d %s\n" KNRM, \
kernelName, lp->grid_dim.x, lp->grid_dim.y, lp->grid_dim.z, lp->group_dim.x, lp->group_dim.y, lp->group_dim.z,
lp->dynamic_group_mem_bytes, streamString.c_str());\
}
// TODO - data-up to data-down:
// Called just before a kernel is launched from hipLaunchKernel.
// Allows runtime to track some information about the stream.
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp)
{
HIP_INIT_API(stream, grid, block, lp);
HIP_INIT();
stream = ihipSyncAndResolveStream(stream);
#if USE_GRID_LAUNCH_20
lp->grid_dim.x = grid.x;
@@ -1439,7 +1498,7 @@ const char *ihipErrorString(hipError_t hip_error)
void ihipSetTs(hipEvent_t e)
{
ihipEvent_t *eh = e._handle;
ihipEvent_t *eh = e;
if (eh->_state == hipEventStatusRecorded) {
// already recorded, done:
return;
@@ -1509,7 +1568,7 @@ void ihipStream_t::setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType,
}
void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind)
void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
{
ihipCtx_t *ctx = this->getCtx();
const ihipDevice_t *device = ctx->getDevice();
@@ -1528,7 +1587,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
bool dstInDeviceMem = dstPtrInfo._isInDeviceMem;
// Resolve default to a specific Kind so we know which algorithm to use:
if (kind == hipMemcpyDefault) {
if (kind == hipMemcpyDefault && resolveOn) {
kind = resolveMemcpyDirection(srcTracked, dstTracked, srcInDeviceMem, dstInDeviceMem);
};
@@ -1699,14 +1758,12 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
// Sync copy that acquires lock:
void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
{
LockedAccessor_StreamCrit_t crit (_criticalData);
copySync(crit, dst, src, sizeBytes, kind);
copySync(crit, dst, src, sizeBytes, kind, resolveOn);
}
void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
{
LockedAccessor_StreamCrit_t crit(_criticalData);
+84 -1
Fájl megtekintése
@@ -446,12 +446,96 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
e = ex._code;
}
return ihipLogStatus(e);
}
hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
{
HIP_INIT_API(dst, src, sizeBytes);
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
hc::completion_future marker;
hipError_t e = hipSuccess;
try {
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToDevice, false);
}
catch (ihipException ex) {
e = ex._code;
}
return ihipLogStatus(e);
}
hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
{
HIP_INIT_API(dst, src, sizeBytes);
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
hc::completion_future marker;
hipError_t e = hipSuccess;
try {
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToHost, false);
}
catch (ihipException ex) {
e = ex._code;
}
return ihipLogStatus(e);
}
hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes)
{
HIP_INIT_API(dst, src, sizeBytes);
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
hc::completion_future marker;
hipError_t e = hipSuccess;
try {
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToDevice, false);
}
catch (ihipException ex) {
e = ex._code;
}
return ihipLogStatus(e);
}
hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
{
HIP_INIT_API(dst, src, sizeBytes);
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
hc::completion_future marker;
hipError_t e = hipSuccess;
try {
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToHost, false);
}
catch (ihipException ex) {
e = ex._code;
}
return ihipLogStatus(e);
}
/**
* @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection,
* @result #hipErrorInvalidValue : If dst==NULL or src==NULL, or other bad argument.
@@ -786,7 +870,6 @@ hipError_t hipMemGetInfo (size_t *free, size_t *total)
// TODO - replace with kernel-level for reporting free memory:
size_t deviceMemSize, hostMemSize, userMemSize;
hc::am_memtracker_sizeinfo(device->_acc, &deviceMemSize, &hostMemSize, &userMemSize);
printf ("deviceMemSize=%zu\n", deviceMemSize);
*free = device->_props.totalGlobalMem - deviceMemSize;
}
+200 -107
Fájl megtekintése
@@ -20,137 +20,188 @@ THE SOFTWARE.
#include "hip_runtime.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#include "hsa/amd_hsa_kernel_code.h"
#include "hcc_detail/hip_hcc.h"
#include "hcc_detail/trace_helper.h"
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <elf.h>
//TODO Use Pool APIs from HCC to get memory regions.
namespace hipdrv{
hsa_status_t findSystemRegions(hsa_region_t region, void *data){
hsa_region_segment_t segment_id;
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
return HSA_STATUS_SUCCESS;
}
hsa_status_t findSystemRegions(hsa_region_t region, void *data){
hsa_region_segment_t segment_id;
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
hsa_region_global_flag_t flags;
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
return HSA_STATUS_SUCCESS;
}
hsa_region_t *reg = (hsa_region_t*)data;
hsa_region_global_flag_t flags;
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
*reg = region;
}
hsa_region_t *reg = (hsa_region_t*)data;
return HSA_STATUS_SUCCESS;
if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
*reg = region;
}
return HSA_STATUS_SUCCESS;
}
} // End namespace hipdrv
uint64_t PrintSymbolSizes(const void *emi, const char *name){
const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
if(NULL == ehdr || EV_CURRENT != ehdr->e_version){}
const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
for(uint16_t i=0;i<ehdr->e_shnum;++i){
if(shdr[i].sh_type == SHT_SYMTAB){
const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset);
assert(syms);
uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize;
const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset);
assert(strtab);
for(uint64_t i=0;i<numSyms;++i){
const char *symname = strtab + syms[i].st_name;
assert(symname);
uint64_t size = syms[i].st_size;
if(strcmp(name, symname) == 0){
return size;
}
}
}
}
return 0;
}
hsa_status_t findKernArgRegions(hsa_region_t region, void *data){
hsa_region_segment_t segment_id;
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
uint64_t ElfSize(const void *emi){
const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
const Elf64_Shdr *shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
return HSA_STATUS_SUCCESS;
}
uint64_t max_offset = ehdr->e_shoff;
uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum;
hsa_region_global_flag_t flags;
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
hsa_region_t *reg = (hsa_region_t*)data;
if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){
*reg = region;
}
return HSA_STATUS_SUCCESS;
for(uint16_t i=0;i < ehdr->e_shnum;++i){
uint64_t cur_offset = static_cast<uint64_t>(shdr[i].sh_offset);
if(max_offset < cur_offset){
max_offset = cur_offset;
total_size = max_offset;
if(SHT_NOBITS != shdr[i].sh_type){
total_size += static_cast<uint64_t>(shdr[i].sh_size);
}
}
}
return total_size;
}
}
hipError_t hipModuleLoad(hipModule *module, const char *fname){
hipError_t hipModuleLoad(hipModule_t *module, const char *fname){
HIP_INIT_API(fname);
hipError_t ret = hipSuccess;
*module = new ihipModule_t;
if(module == NULL){
ret = hipErrorInvalidValue;
return hipErrorInvalidValue;
}
auto ctx = ihipGetTlsDefaultCtx();
if(ctx == nullptr){
ret = hipErrorInvalidContext;
}else{
int deviceId = ctx->getDevice()->_deviceId;
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
std::ifstream in(fname, std::ios::binary | std::ios::ate);
if(!in){
return hipErrorFileNotFound;
}else{
*module = new ihipModule_t;
size_t size = std::string::size_type(in.tellg());
void *p = NULL;
hsa_agent_t agent = currentDevice->_hsaAgent;
hsa_region_t sysRegion;
hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
status = hsa_memory_allocate(sysRegion, size, (void**)&p);
if(status != HSA_STATUS_SUCCESS){
return hipErrorOutOfMemory;
}
char *ptr = (char*)p;
if(!ptr){
return hipErrorOutOfMemory;
std::cout<<"Error: failed to allocate memory for code object"<<std::endl;
}
hsa_code_object_t obj;
(*module)->ptr = p;
(*module)->size = size;
in.seekg(0, std::ios::beg);
std::copy(std::istreambuf_iterator<char>(in),
std::istreambuf_iterator<char>(), ptr);
status = hsa_code_object_deserialize(ptr, size, NULL, &obj);
status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
if(status != HSA_STATUS_SUCCESS){
return hipErrorSharedObjectInitFailed;
}
*module = obj.handle;
status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
if(status != HSA_STATUS_SUCCESS){
return hipErrorNotInitialized;
}
}
}
return ret;
}
hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *name){
HIP_INIT_API(name);
hipError_t hipModuleUnload(hipModule_t hmod){
hipError_t ret = hipSuccess;
hsa_status_t status = hsa_executable_destroy(hmod->executable);
if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
status = hsa_code_object_destroy(hmod->object);
if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
delete hmod;
return ret;
}
hipError_t ihipModuleGetFunction(hipFunction_t *func, hipModule_t hmod, const char *name){
auto ctx = ihipGetTlsDefaultCtx();
hipError_t ret = hipSuccess;
if(name == nullptr || hmod == 0){
if(name == nullptr){
return hipErrorInvalidValue;
}
if(ctx == nullptr){
ret = hipErrorInvalidContext;
}else{
*func = new ihipFunction_t;
int deviceId = ctx->getDevice()->_deviceId;
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent;
hsa_status_t status;
hsa_executable_symbol_t kernel_symbol;
hsa_executable_t executable;
status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &executable);
status = hsa_executable_load_code_object(hmod->executable, gpuAgent, hmod->object, NULL);
if(status != HSA_STATUS_SUCCESS){
return hipErrorNotInitialized;
}
hsa_code_object_t obj;
obj.handle = hmod;
status = hsa_executable_load_code_object(executable, gpuAgent, obj, NULL);
if(status != HSA_STATUS_SUCCESS){
return hipErrorNotInitialized;
}
status = hsa_executable_freeze(executable, NULL);
status = hsa_executable_get_symbol(executable, NULL, name, gpuAgent, 0, &kernel_symbol);
status = hsa_executable_freeze(hmod->executable, NULL);
status = hsa_executable_get_symbol(hmod->executable, NULL, name, gpuAgent, 0, &(*func)->kernel_symbol);
if(status != HSA_STATUS_SUCCESS){
return hipErrorNotFound;
}
status = hsa_executable_symbol_get_info(kernel_symbol,
status = hsa_executable_symbol_get_info((*func)->kernel_symbol,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
func);
&(*func)->kernel);
if(status != HSA_STATUS_SUCCESS){
return hipErrorNotFound;
}
@@ -158,16 +209,24 @@ hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *n
return ret;
}
hipError_t hipLaunchModuleKernel(hipFunction f,
hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
const char *name){
HIP_INIT_API(name);
return ihipModuleGetFunction(hfunc, hmod, name);
}
hipError_t hipModuleLaunchKernel(hipFunction_t f,
uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
uint32_t sharedMemBytes, hipStream_t hStream,
void **kernelParams, void **extra){
HIP_INIT_API(f);
HIP_INIT_API(f->kernel);
auto ctx = ihipGetTlsDefaultCtx();
hipError_t ret = hipSuccess;
if(ctx == nullptr){
ret = hipErrorInvalidDevice;
}else{
int deviceId = ctx->getDevice()->_deviceId;
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
@@ -189,66 +248,100 @@ hipError_t hipLaunchModuleKernel(hipFunction f,
/*
Kernel argument preparation.
*/
hsa_region_t kernArg;
hsa_status_t status = hsa_agent_iterate_regions(gpuAgent, hipdrv::findKernArgRegions, &kernArg);
void *kern;
status = hsa_memory_allocate(kernArg, kernSize, &kern);
if(status != HSA_STATUS_SUCCESS){
return hipErrorLaunchOutOfResources;
}
memcpy(kern, config[1], kernSize);
hsa_status_t status;
grid_launch_parm lp;
hStream = ihipPreLaunchKernel(hStream, 0, 0, &lp);
/*
Pre kernel launch
stream = ihipSyncAndResolveStream(stream);
stream->lockopen_preKernelCommand();
hc::accelerator_view av = &stream->_av;
hc::completion_future cf = new hc::completion_future;
Create signal
*/
hStream = ihipSyncAndResolveStream(hStream);
hc::accelerator_view *av = &hStream->_av;
hsa_queue_t *Queue = (hsa_queue_t*)av->get_hsa_queue();
hsa_signal_t signal;
status = hsa_signal_create(1, 0, NULL, &signal);
/*
Creating the packets
Launch AQL packet
*/
hStream->launchModuleKernel(signal, blockDimX, blockDimY, blockDimZ,
gridDimX, gridDimY, gridDimZ, sharedMemBytes, config[1], kernSize, f->kernel);
/*
Wait for signal
*/
const uint32_t queue_mask = Queue->size-1;
uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
dispatch_packet->completion_signal = signal;
dispatch_packet->workgroup_size_x = blockDimX;
dispatch_packet->workgroup_size_y = blockDimY;
dispatch_packet->workgroup_size_z = blockDimZ;
dispatch_packet->grid_size_x = blockDimX * gridDimX;
dispatch_packet->grid_size_y = blockDimY * gridDimY;
dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
dispatch_packet->group_segment_size = 0;
dispatch_packet->private_segment_size = sharedMemBytes;
dispatch_packet->kernarg_address = kern;
dispatch_packet->kernel_object = f;
uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
uint32_t header32 = header | (setup << 16);
__atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
hsa_queue_store_write_index_relaxed(Queue, packet_index+1);
hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
ihipPostLaunchKernel(hStream, lp);
}
return ret;
}
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
hipModule_t hmod, const char* name){
HIP_INIT_API(name);
hipError_t ret = hipSuccess;
if(dptr == NULL || bytes == NULL){
return hipErrorInvalidValue;
}
if(name == NULL || hmod == NULL){
return hipErrorNotInitialized;
}
else{
hipFunction_t func;
ihipModuleGetFunction(&func, hmod, name);
*bytes = PrintSymbolSizes(hmod->ptr, name) + sizeof(amd_kernel_code_t);
*dptr = reinterpret_cast<void*>(func->kernel);
return ret;
}
}
hipError_t hipModuleLoadData(hipModule_t *module, const void *image){
HIP_INIT_API(image);
hipError_t ret = hipSuccess;
if(image == NULL || module == NULL){
return hipErrorNotInitialized;
}else{
auto ctx = ihipGetTlsDefaultCtx();
*module = new ihipModule_t;
int deviceId = ctx->getDevice()->_deviceId;
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
void *p;
uint64_t size = ElfSize(image);
hsa_agent_t agent = currentDevice->_hsaAgent;
hsa_region_t sysRegion;
hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
status = hsa_memory_allocate(sysRegion, size, (void**)&p);
if(status != HSA_STATUS_SUCCESS){
return hipErrorOutOfMemory;
}
char *ptr = (char*)p;
if(!ptr){
return hipErrorOutOfMemory;
}
(*module)->ptr = p;
(*module)->size = size;
memcpy(ptr, image, size);
status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
if(status != HSA_STATUS_SUCCESS){
return hipErrorSharedObjectInitFailed;
}
status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
if(status != HSA_STATUS_SUCCESS){
return hipErrorNotInitialized;
}
}
return ret;
}
+14 -4
Fájl megtekintése
@@ -67,7 +67,7 @@ hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_
//---
// Disable visibility of this device into memory allocated on peer device.
// Remove this device from peer device peerlist.
hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx)
hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx)
{
HIP_INIT_API(peerCtx);
@@ -109,7 +109,7 @@ hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx)
//---
// Allow the current device to see all memory allocated on peerDevice.
// This should add this device to the peer-device peer list.
hipError_t hipDeviceEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
{
HIP_INIT_API(peerCtx, flags);
@@ -175,7 +175,7 @@ hipError_t hipDeviceDisablePeerAccess (int peerDeviceId)
{
HIP_INIT_API(peerDeviceId);
return hipDeviceDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId));
return ihipDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId));
}
@@ -183,7 +183,7 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags)
{
HIP_INIT_API(peerDeviceId, flags);
return hipDeviceEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags);
return ihipEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags);
}
@@ -200,6 +200,16 @@ hipError_t hipMemcpyPeerAsync (void* dst, int dstDevice, const void* src, int
return hipMemcpyPeerAsync(dst, ihipGetPrimaryCtx(dstDevice), src, ihipGetPrimaryCtx(srcDevice), sizeBytes, stream);
}
hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
{
HIP_INIT_API(peerCtx, flags);
return ihipEnablePeerAccess(peerCtx, flags);
}
hipError_t hipCtxDisablePeerAccess (hipCtx_t peerCtx)
{
HIP_INIT_API(peerCtx);
return ihipDisablePeerAccess(peerCtx);
}
+53
Fájl megtekintése
@@ -0,0 +1,53 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include<iostream>
#include<hip/hip_runtime.h>
#include<hip/hip_runtime_api.h>
#include<hip/hcc_detail/hipComplex.h>
#define LEN 64
#define SIZE 64<<2
__global__ void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
}
int main(){
float *A, *Ad, *B, *Bd, *C, *Cd;
A = new float[LEN];
B = new float[LEN];
C = new float[LEN];
for(uint32_t i=0;i<LEN;i++){
A[i] = i*1.0f;
B[i] = i*1.0f;
C[i] = i*1.0f;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMalloc((void**)&Cd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(getSqAbs, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd);
hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
std::cout<<A[11]<<" "<<B[11]<<" "<<C[11]<<std::endl;
}
+42
Fájl megtekintése
@@ -0,0 +1,42 @@
#include<iostream>
#include<hip/hip_runtime.h>
#include<hip/hip_runtime_api.h>
#define LEN 1030
#define SIZE LEN << 2
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald)
{
memcpy(Out, In, SIZE, Vald);
}
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
{
memset(ptr, val, size);
}
int main()
{
uint32_t *A, *Ad, *B, *Bd;
uint32_t *Val, *Vald;
A = new uint32_t[LEN];
B = new uint32_t[LEN];
Val = new uint32_t;
*Val = 0;
for(int i=0;i<LEN;i++){
A[i] = i *1.0f;
B[i] = 0.0f;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMalloc((void**)&Vald, sizeof(uint32_t));
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(cpy, dim3(1), dim3(LEN/4), 0, 0, Bd, Ad, Vald);
hipLaunchKernel(set, dim3(1), dim3(LEN/4), 0, 0, Bd, 0x1, SIZE);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
hipMemcpy(Val, Vald, sizeof(uint32_t), hipMemcpyDeviceToHost);
for(int i=LEN-16;i<LEN;i++){
std::cout<<A[i]<<" "<<B[i]<<std::endl;
}
std::cout<<*Val<<std::endl;
}
+26
Fájl megtekintése
@@ -0,0 +1,26 @@
#include<iostream>
#include<hip/hip_runtime.h>
#include<hip/hip_runtime_api.h>
#define LEN 1024
#define SIZE LEN<<2
int main(){
int *A, *B, *C;
hipDeviceptr Ad, Bd;
A = new int[LEN];
B = new int[LEN];
C = new int[LEN];
for(int i=0;i<LEN;i++){
A[i] = i;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpyHtoD(Ad, A, SIZE);
hipMemcpyDtoD(Bd, Ad, SIZE);
hipMemcpyDtoH(B, Bd, SIZE);
hipMemcpyHtoH(C, B,SIZE);
for(int i=0;i<16;i++){
std::cout<<A[i]<<" "<<C[i]<<std::endl;
}
}
+85
Fájl megtekintése
@@ -0,0 +1,85 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include<hip_runtime.h>
#include<hip_runtime_api.h>
#include<iostream>
#include<fstream>
#include<vector>
#define LEN 64
#define SIZE LEN<<2
#define fileName "vcpy_isa.co"
#define kernel_name "hello_world"
__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
int tx = hipThreadIdx_x;
Bd[tx] = Ad[tx];
}
int main(){
float *A, *B, *Ad, *Bd;
A = new float[LEN];
B = new float[LEN];
for(uint32_t i=0;i<LEN;i++){
A[i] = i*1.0f;
B[i] = 0.0f;
std::cout<<A[i] << " "<<B[i]<<std::endl;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
hipModule_t Module;
hipFunction_t Function;
hipModuleLoad(&Module, fileName);
hipModuleGetFunction(&Function, Module, kernel_name);
hipStream_t stream;
hipStreamCreate(&stream);
void *args[2] = {&Ad, &Bd};
std::vector<void*>argBuffer(2);
memcpy(&argBuffer[0], &Ad, sizeof(void*));
memcpy(&argBuffer[1], &Bd, sizeof(void*));
size_t size = argBuffer.size()*sizeof(void*);
void *config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
HIP_LAUNCH_PARAM_END
};
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config);
hipStreamDestroy(stream);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
for(uint32_t i=0;i<LEN;i++){
std::cout<<A[i]<<" - "<<B[i]<<std::endl;
}
return 0;
}
+31
Fájl megtekintése
@@ -0,0 +1,31 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include<hip/hip_runtime.h>
#include<hip/hip_runtime_api.h>
#include<iostream>
#define fileName "vcpy_isa.co"
int main(){
hipModule_t module;
hipModuleLoad(&module, fileName);
hipModuleUnload(module);
}
+94
Fájl megtekintése
@@ -0,0 +1,94 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include<hip_runtime.h>
#include<hip_runtime_api.h>
#include<iostream>
#include<fstream>
#include<vector>
#define LEN 64
#define SIZE LEN<<2
#ifdef __HIP_PLATFORM_HCC__
#define fileName "vcpy_isa.co"
#endif
#ifdef __HIP_PLATFORM_NVCC__
#define fileName "vcpy_isa.ptx"
#endif
#define kernel_name "hello_world"
int main(){
float *A, *B;
hipDeviceptr_t Ad, Bd;
A = new float[LEN];
B = new float[LEN];
for(uint32_t i=0;i<LEN;i++){
A[i] = i*1.0f;
B[i] = 0.0f;
std::cout<<A[i] << " "<<B[i]<<std::endl;
}
#ifdef __HIP_PLATFORM_NVCC__
hipInit(0);
hipDevice_t device;
hipCtx_t context;
hipDeviceGet(&device, 0);
hipCtxCreate(&context, 0, device);
#endif
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpyHtoD(Ad, A, SIZE);
hipMemcpyHtoD(Bd, B, SIZE);
hipModule_t Module;
hipFunction_t Function;
hipModuleLoad(&Module, fileName);
hipModuleGetFunction(&Function, Module, kernel_name);
std::vector<void*>argBuffer(2);
memcpy(&argBuffer[0], &Ad, sizeof(void*));
memcpy(&argBuffer[1], &Bd, sizeof(void*));
size_t size = argBuffer.size()*sizeof(void*);
void *config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
HIP_LAUNCH_PARAM_END
};
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config);
hipMemcpyDtoH(B, Bd, SIZE);
for(uint32_t i=0;i<LEN;i++){
std::cout<<A[i]<<" - "<<B[i]<<std::endl;
}
#ifdef __HIP_PLATFORM_NVCC__
hipCtxDetach(context);
#endif
return 0;
}
BINáris
Fájl megtekintése
Binary file not shown.
+6
Fájl megtekintése
@@ -0,0 +1,6 @@
extern "C" __global__ void hello_world(float *a, float *b)
{
int tx = threadIdx.x;
b[tx] = a[tx];
}
+38
Fájl megtekintése
@@ -0,0 +1,38 @@
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19856038
// Cuda compilation tools, release 7.5, V7.5.17
// Based on LLVM 3.4svn
//
.version 4.3
.target sm_20
.address_size 64
// .globl hello_world
.visible .entry hello_world(
.param .u64 hello_world_param_0,
.param .u64 hello_world_param_1
)
{
.reg .f32 %f<2>;
.reg .b32 %r<2>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [hello_world_param_0];
ld.param.u64 %rd2, [hello_world_param_1];
cvta.to.global.u64 %rd3, %rd2;
cvta.to.global.u64 %rd4, %rd1;
mov.u32 %r1, %tid.x;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
add.s64 %rd7, %rd3, %rd5;
st.global.f32 [%rd7], %f1;
ret;
}