Merge branch 'rocm-rel-1.5'

Change-Id: Ib2318f9c0d01a1bc8be2fcb172a3075e82851877


[ROCm/hip-tests commit: fa64db5171]
This commit is contained in:
Maneesh Gupta
2017-05-02 09:06:49 +05:30
bovenliggende 9ab3b01ba6 182d5261b3
commit b1d90234a8
87 gewijzigde bestanden met toevoegingen van 1028 en 371 verwijderingen
@@ -11,10 +11,6 @@ HIPCC=$(HIP_PATH)/bin/hipcc
ifeq (${HIP_PLATFORM}, nvcc)
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
endif
ifeq (${HIP_PLATFORM}, hcc)
HIPCC_FLAGS = -stdlib=libc++
endif
EXE=bit_extract
@@ -24,4 +20,3 @@ $(EXE): bit_extract.cpp
clean:
rm -f *.o $(EXE)
@@ -37,7 +37,7 @@ THE SOFTWARE.
}\
}
void __global__
__global__ void
bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t N)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
@@ -45,7 +45,7 @@ bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t
for (size_t i=offset; i<N; i+=stride) {
#ifdef __HIP_PLATFORM_HCC__
C_d[i] = hc::__bitextract_u32(A_d[i], 8, 4);
C_d[i] = hc::__bitextract_u32(A_d[i], 8, 4);
#else /* defined __HIP_PLATFORM_NVCC__ or other path */
C_d[i] = ((A_d[i] & 0xf00) >> 8);
#endif
@@ -73,7 +73,7 @@ int main(int argc, char *argv[])
C_h = (uint32_t*)malloc(Nbytes);
CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
for (size_t i=0; i<N; i++)
for (size_t i=0; i<N; i++)
{
A_h[i] = i;
}
@@ -5,14 +5,19 @@ endif
HIPCC=$(HIP_PATH)/bin/hipcc
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
all: vcpy_kernel.code runKernel.hip.out
all: vcpy_kernel.code runKernel.hip.out defaultDriver.hip.out
runKernel.hip.out: runKernel.cpp
$(HIPCC) $(HIPCC_FLAGS) $< -o $@
launchKernelHcc.hip.out: launchKernelHcc.cpp
$(HIPCC) $(HIPCC_FLAGS) $< -o $@
defaultDriver.hip.out: defaultDriver.cpp
$(HIPCC) $(HIPCC_FLAGS) $< -o $@
vcpy_kernel.code: vcpy_kernel.cpp
$(HIPCC) --genco $(GENCO_FLAGS) $^ -o $@
$(HIPCC) --genco $(GENCO_FLAGS) $^ -o $@
clean:
rm -f *.code *.out
@@ -0,0 +1,89 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include<iostream>
#include<fstream>
#include<vector>
#define LEN 64
#define SIZE LEN<<2
#define fileName "test.co"
#define kernel_name "vadd"
int main(){
float *A, *B, *C;
hipDeviceptr_t Ad, Bd, Cd;
A = new float[LEN];
B = new float[LEN];
C = new float[LEN];
for(uint32_t i=0;i<LEN;i++){
A[i] = i*1.0f;
B[i] = 1.0f;
C[i] = 0.0f;
}
hipInit(0);
hipDevice_t device;
hipCtx_t context;
hipDeviceGet(&device, 0);
hipCtxCreate(&context, 0, device);
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMalloc((void**)&Cd, SIZE);
hipMemcpyHtoD(Ad, A, SIZE);
hipMemcpyHtoD(Bd, B, SIZE);
hipMemcpyHtoD(Cd, C, SIZE);
hipModule_t Module;
hipFunction_t Function;
hipModuleLoad(&Module, fileName);
hipModuleGetFunction(&Function, Module, kernel_name);
int n = LEN;
void * args[4] = {&Ad, &Bd, &Cd, &n};
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, args, nullptr);
hipMemcpyDtoH(C, Cd, SIZE);
int mismatchCount = 0;
for(uint32_t i=0;i<LEN;i++){
if (A[i] + B[i] != C[i]) {
mismatchCount++;
std::cout<<"error: mismatch " << A[i]<<" + "<<B[i]<<" != "<<C[i]<<std::endl;
}
}
if (mismatchCount == 0) {
std::cout << "PASSED!\n";
} else {
std::cout << "FAILED!\n";
};
hipCtxDestroy(context);
return 0;
}
@@ -0,0 +1,112 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include <iostream>
#include <fstream>
#include <vector>
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_hcc.h>
#endif
#define LEN 64
#define SIZE LEN<<2
#define fileName "vcpy_kernel.code"
#define kernel_name "hello_world"
#define HIP_CHECK(status) \
if(status != hipSuccess) {std::cout<<"Got Status: "<<status<<" at Line: "<<__LINE__<<std::endl;exit(0);}
int main(){
float *A, *B;
hipDeviceptr_t Ad, Bd;
A = new float[LEN];
B = new float[LEN];
for(uint32_t i=0;i<LEN;i++){
A[i] = i*1.0f;
B[i] = 0.0f;
}
hipInit(0);
hipDevice_t device;
hipCtx_t context;
hipDeviceGet(&device, 0);
hipCtxCreate(&context, 0, device);
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpyHtoD(Ad, A, SIZE);
hipMemcpyHtoD(Bd, B, SIZE);
hipModule_t Module;
hipFunction_t Function;
HIP_CHECK(hipModuleLoad(&Module, fileName));
HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));
uint32_t len = LEN;
uint32_t one = 1;
struct {
void * _Ad;
void * _Bd;
} args;
args._Ad = Ad;
args._Bd = Bd;
size_t size = sizeof(args);
void *config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
HIP_LAUNCH_PARAM_END
};
HIP_CHECK(hipHccModuleLaunchKernel(Function, LEN, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config));
hipMemcpyDtoH(B, Bd, SIZE);
int mismatchCount = 0;
for(uint32_t i=0;i<LEN;i++){
if (A[i] != B[i]) {
mismatchCount++;
std::cout<<"error: mismatch " << A[i]<<" != "<<B[i]<<std::endl;
}
}
if (mismatchCount == 0) {
std::cout << "PASSED!\n";
} else {
std::cout << "FAILED!\n";
};
hipCtxDestroy(context);
return 0;
}
@@ -1,5 +1,5 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -22,9 +22,10 @@ THE SOFTWARE.
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include<iostream>
#include<fstream>
#include<vector>
#include <iostream>
#include <fstream>
#include <vector>
#include <hip/hip_hcc.h>
#define LEN 64
#define SIZE LEN<<2
@@ -32,6 +33,9 @@ THE SOFTWARE.
#define fileName "vcpy_kernel.code"
#define kernel_name "hello_world"
#define HIP_CHECK(status) \
if(status != hipSuccess) {std::cout<<"Got Status: "<<status<<" at Line: "<<__LINE__<<std::endl;exit(0);}
int main(){
float *A, *B;
hipDeviceptr_t Ad, Bd;
@@ -43,10 +47,10 @@ int main(){
B[i] = 0.0f;
}
hipInit(0);
hipDevice_t device;
hipCtx_t context;
hipDeviceGet(&device, 0);
hipInit(0);
hipDevice_t device;
hipCtx_t context;
hipDeviceGet(&device, 0);
hipCtxCreate(&context, 0, device);
hipMalloc((void**)&Ad, SIZE);
@@ -56,22 +60,18 @@ int main(){
hipMemcpyHtoD(Bd, B, SIZE);
hipModule_t Module;
hipFunction_t Function;
hipModuleLoad(&Module, fileName);
hipModuleGetFunction(&Function, Module, kernel_name);
HIP_CHECK(hipModuleLoad(&Module, fileName));
HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));
#ifdef __HIP_PLATFORM_HCC__
uint32_t len = LEN;
uint32_t one = 1;
struct {
uint32_t _hidden[6];
void * _Ad;
void * _Bd;
} args;
for (int i=0; i<6; i++) {
args._hidden[i] = 0;
}
args._Ad = Ad;
args._Bd = Bd;
@@ -98,9 +98,10 @@ int main(){
HIP_LAUNCH_PARAM_END
};
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config);
HIP_CHECK(hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config));
hipMemcpyDtoH(B, Bd, SIZE);
int mismatchCount = 0;
for(uint32_t i=0;i<LEN;i++){
if (A[i] != B[i]) {
@@ -0,0 +1,12 @@
__kernel void memset(char in, __global int* out) {
int tx = get_global_id(0);
out[tx] = in;
}
__kernel void vadd(__global float *Ad, __global float *Bd, __global float *Cd, int N){
int tx = get_global_id(0);
if(tx < N){
Cd[tx] = Ad[tx] + Bd[tx];
}
}
Binair bestand niet weergegeven.
@@ -1,5 +1,5 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -27,4 +27,3 @@ extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b)
int tx = hipThreadIdx_x;
b[tx] = a[tx];
}
@@ -15,5 +15,6 @@ square.hip.out: square.hipref.cpp
clean:
rm -f *.o *.out
@@ -32,7 +32,7 @@ THE SOFTWARE.
}\
}
/*
/*
* Square each element in the array A and write to array C.
*/
template <typename T>
@@ -58,16 +58,18 @@ int main(int argc, char *argv[])
hipDeviceProp_t props;
CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
printf ("info: running on device %s\n", props.name);
#ifdef __HIP_PLATFORM_HCC__
printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch);
#endif
printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
A_h = (float*)malloc(Nbytes);
CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
C_h = (float*)malloc(Nbytes);
CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
// Fill with Phi + i
for (size_t i=0; i<N; i++)
for (size_t i=0; i<N; i++)
{
A_h[i] = 1.618f + i;
A_h[i] = 1.618f + i;
}
printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
@@ -81,7 +83,7 @@ int main(int argc, char *argv[])
const unsigned threadsPerBlock = 256;
printf ("info: launch 'vector_square' kernel\n");
hipLaunchKernel(HIP_KERNEL_NAME(vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
printf ("info: copy Device2Host\n");
CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
@@ -6,9 +6,12 @@
#include "ResultDatabase.h"
enum MallocMode {MallocPinned, MallocUnpinned, MallocRegistered};
// Cmdline parms:
bool p_verbose = false;
bool p_pinned = true;
MallocMode p_malloc_mode = MallocPinned;
int p_numa_ctl = -1;
int p_iterations = 10;
int p_beatsperiteration=1;
int p_device = 0;
@@ -21,7 +24,7 @@ bool p_h2d = true;
bool p_d2h = true;
bool p_bidir = true;
//#define NO_CHECK
#define CHECK_HIP_ERROR() \
@@ -36,6 +39,14 @@ bool p_bidir = true;
}
std::string mallocModeString(int mallocMode) {
switch (mallocMode) {
case MallocPinned : return "pinned";
case MallocUnpinned: return "unpinned";
case MallocRegistered: return "registered";
default: return "mallocmode-UNKNOWN";
};
};
// ****************************************************************************
int sizeToBytes(int size) {
@@ -106,7 +117,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
// Create some host memory pattern
float *hostMem = NULL;
if (p_pinned)
if (p_malloc_mode == MallocPinned)
{
hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats);
while (hipGetLastError() != hipSuccess)
@@ -116,20 +127,33 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any pinned buffer\n";
std::cerr << "Error: Couldn't allocate any pinned buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats);
}
}
else
else if (p_malloc_mode == MallocUnpinned)
{
if (p_alignedhost) {
hostMem = (float*)aligned_alloc(p_alignedhost, numMaxFloats*sizeof(float));
} else {
hostMem = new float[numMaxFloats];
}
}
else if (p_malloc_mode == MallocRegistered)
{
if (p_numa_ctl == -1) {
hostMem = (float*)malloc(numMaxFloats*sizeof(float));
}
hipHostRegister(hostMem, numMaxFloats * sizeof(float), 0);
CHECK_HIP_ERROR();
}
else
{
assert(0);
}
for (int i = 0; i < numMaxFloats; i++)
@@ -146,7 +170,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any device buffer\n";
std::cerr << "Error: Couldn't allocate any device buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
@@ -199,8 +223,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("H2D_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("H2D_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, "ms", t);
if (p_onesize) {
break;
@@ -212,6 +236,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
numMaxFloats = sizeToBytes(p_onesize) / sizeof(float);
}
#ifndef NO_CHECK
// Check. First reset the host memory, then copy-back result. Then compare against original ref value.
for (int i = 0; i < numMaxFloats; i++)
{
@@ -225,24 +251,36 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
printf ("error: H2D. i=%d reference:%6.f != copyback:%6.2f\n", i, ref, hostMem[i]);
}
}
#endif
// Cleanup
hipFree((void*)device);
CHECK_HIP_ERROR();
if (p_pinned)
{
switch (p_malloc_mode) {
case MallocPinned:
hipHostFree((void*)hostMem);
CHECK_HIP_ERROR();
}
else
{
break;
case MallocUnpinned:
if (p_alignedhost) {
delete[] hostMem;
} else {
free(hostMem);
}
break;
case MallocRegistered:
hipHostUnregister(hostMem);
CHECK_HIP_ERROR();
free(hostMem);
break;
default:
assert(0);
}
hipEventDestroy(start);
hipEventDestroy(stop);
}
@@ -257,38 +295,56 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
// Create some host memory pattern
float *hostMem1;
float *hostMem2;
if (p_pinned)
if (p_malloc_mode == MallocPinned)
{
hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
hipError_t err1 = hipGetLastError();
hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
hipError_t err2 = hipGetLastError();
while (err1 != hipSuccess || err2 != hipSuccess)
{
// free the first buffer if only the second failed
if (err1 == hipSuccess)
hipHostFree((void*)hostMem1);
while (err1 != hipSuccess || err2 != hipSuccess)
{
// free the first buffer if only the second failed
if (err1 == hipSuccess)
hipHostFree((void*)hostMem1);
// drop the size and try again
if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any pinned buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
err1 = hipGetLastError();
hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
err2 = hipGetLastError();
}
}
else
// drop the size and try again
if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocate any pinned buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
err1 = hipGetLastError();
hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
err2 = hipGetLastError();
}
}
else if (p_malloc_mode == MallocUnpinned)
{
hostMem1 = new float[numMaxFloats];
hostMem2 = new float[numMaxFloats];
}
else if (p_malloc_mode == MallocRegistered)
{
if (p_numa_ctl == -1) {
hostMem1 = (float*)malloc(numMaxFloats*sizeof(float));
hostMem2 = (float*)malloc(numMaxFloats*sizeof(float));
}
hipHostRegister(hostMem1, numMaxFloats * sizeof(float), 0);
CHECK_HIP_ERROR();
hipHostRegister(hostMem2, numMaxFloats * sizeof(float), 0);
CHECK_HIP_ERROR();
}
else
{
assert(0);
}
for (int i=0; i<numMaxFloats; i++)
hostMem1[i] = i % 77;
@@ -301,7 +357,7 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any device buffer\n";
std::cerr << "Error: Couldn't allocate any device buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
@@ -358,8 +414,8 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("D2H_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("D2H_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
resultDB.AddResult(std::string("D2H_Bandwidth") +"_" + mallocModeString(p_malloc_mode) , sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("D2H_Time") +"_" + mallocModeString(p_malloc_mode) , sizeStr, "ms", t);
if (p_onesize) {
break;
}
@@ -381,20 +437,31 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
// Cleanup
hipFree((void*)device);
CHECK_HIP_ERROR();
if (p_pinned)
{
switch (p_malloc_mode) {
case MallocPinned:
hipHostFree((void*)hostMem1);
CHECK_HIP_ERROR();
hipHostFree((void*)hostMem2);
CHECK_HIP_ERROR();
}
else
{
break;
case MallocUnpinned:
delete[] hostMem1;
delete[] hostMem2;
hipEventDestroy(start);
hipEventDestroy(stop);
break;
case MallocRegistered:
hipHostUnregister(hostMem1);
CHECK_HIP_ERROR();
free(hostMem1);
hipHostUnregister(hostMem2);
free(hostMem2);
break;
default:
assert(0);
}
hipEventDestroy(start);
hipEventDestroy(stop);
}
@@ -409,7 +476,7 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
// Create some host memory pattern
float *hostMem[2] = {NULL, NULL};
if (p_pinned)
if (p_malloc_mode == MallocPinned)
{
while (1)
{
@@ -424,18 +491,34 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any pinned buffer\n";
std::cerr << "Error: Couldn't allocate any pinned buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
}
}
}
else
else if (p_malloc_mode == MallocUnpinned)
{
hostMem[0] = new float[numMaxFloats];
hostMem[1] = new float[numMaxFloats];
}
else if (p_malloc_mode == MallocRegistered)
{
if (p_numa_ctl == -1) {
hostMem[0] = (float*)malloc(numMaxFloats*sizeof(float));
hostMem[1] = (float*)malloc(numMaxFloats*sizeof(float));
}
hipHostRegister(hostMem[0], numMaxFloats * sizeof(float), 0);
CHECK_HIP_ERROR();
hipHostRegister(hostMem[1], numMaxFloats * sizeof(float), 0);
CHECK_HIP_ERROR();
}
else
{
assert(0);
}
for (int i = 0; i < numMaxFloats; i++)
{
@@ -459,7 +542,7 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
--nSizes;
if (nSizes < 1)
{
std::cerr << "Error: Couldn't allocated any device buffer\n";
std::cerr << "Error: Couldn't allocate any device buffer\n";
return;
}
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
@@ -512,8 +595,8 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
double speed = (double(sizeToBytes(thisSize)) / (1000*1000)) / t;
char sizeStr[256];
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
resultDB.AddResult(std::string("Bidir_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("Bidir_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
resultDB.AddResult(std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode), sizeStr, "ms", t);
}
}
@@ -521,17 +604,27 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
hipFree((void*)deviceMem[0]);
hipFree((void*)deviceMem[1]);
CHECK_HIP_ERROR();
if (p_pinned)
{
switch (p_malloc_mode) {
case MallocPinned:
hipHostFree((void*)hostMem[0]);
hipHostFree((void*)hostMem[1]);
CHECK_HIP_ERROR();
}
else
{
break;
case MallocUnpinned:
delete[] hostMem[0];
delete[] hostMem[1];
}
break;
case MallocRegistered:
for (int i=0; i<2; i++) {
hipHostUnregister(hostMem[i]);
CHECK_HIP_ERROR();
free(hostMem[i]);
}
break;
default:
assert(0);
};
hipEventDestroy(start);
hipEventDestroy(stop);
hipStreamDestroy(stream[0]);
@@ -557,7 +650,7 @@ void printConfig() {
hipDeviceProp_t props;
hipGetDeviceProperties(&props, p_device);
printf ("Device:%s Mem=%.1fGB #CUs=%d Freq=%.0fMhz Pinned=%s\n", props.name, props.totalGlobalMem/1024.0/1024.0/1024.0, props.multiProcessorCount, props.clockRate/1000.0, p_pinned ? "YES" : "NO");
printf ("Device:%s Mem=%.1fGB #CUs=%d Freq=%.0fMhz MallocMode=%s\n", props.name, props.totalGlobalMem/1024.0/1024.0/1024.0, props.multiProcessorCount, props.clockRate/1000.0, mallocModeString(p_malloc_mode).c_str());
}
void help() {
@@ -601,7 +694,9 @@ int parseStandardArguments(int argc, char *argv[])
failed("Bad onesize argument");
}
} else if (!strcmp(arg, "--unpinned")) {
p_pinned = 0;
p_malloc_mode = MallocUnpinned;
} else if (!strcmp(arg, "--registered")) {
p_malloc_mode = MallocRegistered;
} else if (!strcmp(arg, "--h2d")) {
p_h2d = true;
p_d2h = false;
@@ -10,9 +10,6 @@ OPT=-O3
CXXFLAGS = $(OPT) --std=c++11
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
ifeq (${HIP_PLATFORM}, hcc)
CXXFLAGS += " -stdlib=libc++"
endif
CODE_OBJECTS=nullkernel.hsaco
@@ -1,50 +0,0 @@
_ Add AQL kernel.
_ Fix &*kernel command so the kernel name/type is an argument not a new command.
_ Add command to parse only.
_ Add regression to parse all the hcm files.
_ Partition HCC, HIP, HSA, OpenCL commands into separate files.
_ Show time for back-to-back copies.
_ Add variables.
%loopcnt
./hipCommander %loopcnt=4
_ Add datasize command.
_ Add ( ) to parsing.
_ Add argument parsing and checking.
_ Add verbose option to print each step of setup.
- print deliniater between setup and run. Add run start message.
- print sizes of all buffers.
- print each command before running.
- show start/stop of timer routine.
_
_ Clear documentation on what each oepration does.
_ Add time instrumentation for each command.
_ Add pcie atomic.
_ Add tests for negative cases, ie endloop w/o opening loop.
README tips
---
- HIP_API_TRACE combined with -v is useful to track the exact commands generates by hipCommander.
Other ideas:
---
[ ] Perf guide : stream creation very slow on HCC and should be avoided.
Scratch:
@@ -11,7 +11,6 @@
#include <elf.h>
#include <hsa/hsa.h>
#include <hc.hpp>
#include <hip/hcc_detail/hcc_acc.h>
#endif
#include <sys/time.h>
@@ -23,7 +22,7 @@ bool g_printedTiming = false;
// Cmdline parms:
int p_device = 0;
const char* p_command = "H2D; NullKernel; D2H";
const char* p_command = "setstream(1); H2D; NullKernel; D2H;";
const char* p_file = nullptr;
unsigned p_verbose = 0x0;
unsigned p_db = 0x0;
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1);
loop(10); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1);
loop(100); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1);
loop(100); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1);
loop(1000); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1);
loop(1000); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1);
loop(10000); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1);
loop(10000); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1);
loop(10); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1);
loop(100); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1);
loop(100); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1);
loop(1000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1);
loop(1000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1);
loop(10000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1);
loop(10000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1);
loop(10); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1);
loop(100); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1);
loop(100); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1);
loop(1000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1);
loop(1000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1);
loop(10000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1);
loop(10000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1);
loop(10); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1);
loop(100); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1);
loop(100); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1);
loop(10000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1);
loop(10000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1);
loop(10); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1);
loop(100); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1);
loop(100); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1);
loop(1000); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1);
loop(1000); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1);
loop(10000); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1);
loop(10000); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1);
loop(10); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync;endloop(1);
loop(100); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync;endloop(1);
loop(100); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync; endloop(1);
loop(1000); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync; endloop(1);
loop(1000); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync; endloop(1);
loop(10000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1);
loop(10000); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1);
loop(10); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1);
loop(100); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1);
loop(100); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1);
loop(1000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1);
loop(1000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1);
loop(10000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1);
loop(10000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(10); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(100); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(100); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(10000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(10000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1);
loop(10); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1);
loop(100); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1);
loop(100); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1);
loop(1000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1);
loop(1000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1);
loop(10000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1);
loop(10000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1);
loop(10); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1);
loop(100); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1);
loop(100); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1);
loop(1000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1);
loop(1000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1);
loop(10000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1);
loop(10000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(10); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(100); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(100); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(10); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(100); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(100); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); streamsync; streamsync; endloop(1);
loop(10); streamsync; streamsync; endloop(1);
loop(100); streamsync; streamsync; endloop(1);
loop(100); streamsync; streamsync; endloop(1);
loop(1000); streamsync; streamsync; endloop(1);
loop(1000); streamsync; streamsync; endloop(1);
loop(10000); streamsync; streamsync; endloop(1);
loop(10000); streamsync; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); D2H; streamsync; D2H; streamsync; endloop(1);
loop(10); D2H; streamsync; D2H; streamsync; endloop(1);
loop(100); D2H; streamsync; D2H; streamsync; endloop(1);
loop(100); D2H; streamsync; D2H; streamsync; endloop(1);
loop(1000); D2H;streamsync; D2H; streamsync; endloop(1);
loop(1000); D2H; streamsync; D2H; streamsync; endloop(1);
loop(1000); D2H; streamsync; D2H; streamsync; endloop(1);
loop(10000); D2H; streamsync; D2H; streamsync; endloop(1);
loop(10000); D2H; streamsync; D2H; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); D2H; D2H; streamsync; endloop(1);
loop(10); D2H; D2H; streamsync; endloop(1);
loop(100); D2H; D2H; streamsync; endloop(1);
loop(100); D2H; D2H; streamsync; endloop(1);
loop(1000); D2H; D2H; streamsync; endloop(1);
loop(1000); D2H; D2H; streamsync; endloop(1);
loop(1000); D2H; D2H; streamsync; endloop(1);
loop(10000); D2H; D2H; streamsync; endloop(1);
loop(10000); D2H; D2H; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); H2D; streamsync; H2D; streamsync; endloop(1);
loop(10); H2D; streamsync; H2D; streamsync; endloop(1);
loop(100); H2D; streamsync; H2D; streamsync; endloop(1);
loop(100); H2D; streamsync; H2D; streamsync; endloop(1);
loop(1000); H2D;streamsync; H2D; streamsync; endloop(1);
loop(1000); H2D; streamsync; H2D; streamsync; endloop(1);
loop(1000); H2D; streamsync; H2D; streamsync; endloop(1);
loop(10000); H2D; streamsync; H2D; streamsync; endloop(1);
loop(10000); H2D; streamsync; H2D; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1);
loop(10); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(100); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1);
loop(100); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1);
loop(1000); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1);
loop(1000); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1);
loop(10000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(10000); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1);
loop(10); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync;endloop(1);
loop(100); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync;endloop(1);
loop(100); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync;endloop(1);
loop(1000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1);
loop(1000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1);
loop(10000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1);
loop(10000); H2D ; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); H2D; H2D; streamsync; endloop(1);
loop(10); H2D; H2D; streamsync; endloop(1);
loop(100); H2D; H2D; streamsync; endloop(1);
loop(100); H2D; H2D; streamsync; endloop(1);
loop(1000); H2D; H2D; streamsync; endloop(1);
loop(1000); H2D; H2D; streamsync; endloop(1);
loop(1000); H2D; H2D; streamsync; endloop(1);
loop(10000); H2D; H2D; streamsync; endloop(1);
loop(10000); H2D; H2D; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(10); NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(100); NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(100); NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); NullKernel; NullKernel; streamsync; endloop(1);
loop(10); NullKernel; NullKernel; streamsync; endloop(1);
loop(100); NullKernel; NullKernel; streamsync; endloop(1);
loop(100); NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1);
loop(10); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1);
loop(100); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1);
loop(100); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1);
loop(1000); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1);
loop(1000); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1);
loop(10000); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1);
loop(10000); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1);
loop(10); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1);
loop(100); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1);
loop(100); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1);
loop(1000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1);
loop(1000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1);
loop(10000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1);
loop(10000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1);
loop(10); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1);
loop(100); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1);
loop(100); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1);
loop(1000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1);
loop(1000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1);
loop(10000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1);
loop(10000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1);
loop(10); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1);
loop(100); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1);
loop(100); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1);
loop(1000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1);
loop(1000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1);
loop(10000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1);
loop(10000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1);
loop(10); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1);
loop(100); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1);
loop(100); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1);
loop(1000); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1);
loop(1000); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1);
loop(10000); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1);
loop(10000); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1);
loop(10); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1);
loop(100); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1);
loop(100); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1);
loop(1000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1);
loop(1000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1);
loop(10000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1);
loop(10000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1);
loop(10); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1);
loop(100); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1);
loop(100); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1);
loop(1000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1);
loop(1000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1);
loop(10000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1);
loop(10000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1);
loop(10); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1);
loop(100); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1);
loop(100); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1);
loop(1000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1);
loop(1000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1);
loop(10000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1);
loop(10000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1);
loop(10); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1);
loop(100); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1);
loop(100); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1);
loop(1000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1);
loop(1000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1);
loop(10000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1);
loop(10000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1);
loop(10); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1);
loop(100); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1);
loop(100); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1);
loop(1000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1);
loop(1000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1);
loop(10000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1);
loop(10000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(10); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(100); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(100); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(10); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(100); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(100); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10);streamsync; streamsync; streamsync; endloop(1);
loop(10);streamsync; streamsync; streamsync; endloop(1);
loop(100);streamsync; streamsync; streamsync; endloop(1);
loop(100);streamsync; streamsync; streamsync; endloop(1);
loop(1000);streamsync; streamsync; streamsync; endloop(1);
loop(1000);streamsync; streamsync; streamsync; endloop(1);
loop(10000);streamsync; streamsync; streamsync; endloop(1);
loop(10000);streamsync; streamsync; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1);
loop(10); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1);
loop(100); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1);
loop(100); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1);
loop(1000); D2H;streamsync; D2H; streamsync; D2H; streamsync; endloop(1);
loop(1000); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1);
loop(10000); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1);
loop(10000); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; D2H; D2H; streamsync; endloop(1);
loop(10); D2H; D2H; D2H; streamsync; endloop(1);
loop(100); D2H; D2H; D2H; streamsync; endloop(1);
loop(100); D2H; D2H; D2H; streamsync; endloop(1);
loop(1000); D2H; D2H; D2H; streamsync; endloop(1);
loop(1000); D2H; D2H; D2H; streamsync; endloop(1);
loop(10000); D2H; D2H; D2H; streamsync; endloop(1);
loop(10000); D2H; D2H; D2H;streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); H2D; streamsync;H2D;streamsync; H2D; streamsync; endloop(1);
loop(10); H2D; streamsync;H2D;streamsync; H2D; streamsync; endloop(1);
loop(100); H2D; streamsync;H2D; streamsync;H2D; streamsync; endloop(1);
loop(100); H2D;streamsync; H2D; streamsync;H2D; streamsync; endloop(1);
loop(1000); H2D;streamsync; H2D;streamsync; H2D; streamsync; endloop(1);
loop(1000); H2D;streamsync; H2D; streamsync;H2D; streamsync; endloop(1);
loop(1000); H2D;streamsync; H2D; streamsync;H2D; streamsync; endloop(1);
loop(10000); H2D;streamsync; H2D; streamsync;H2D; streamsync; endloop(1);
loop(10000); H2D;streamsync; H2D;streamsync; H2D; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); H2D; H2D; H2D; streamsync; endloop(1);
loop(10); H2D; H2D; H2D; streamsync; endloop(1);
loop(100); H2D; H2D; H2D; streamsync; endloop(1);
loop(100); H2D; H2D; H2D; streamsync; endloop(1);
loop(1000); H2D; H2D; H2D; streamsync; endloop(1);
loop(1000); H2D; H2D; H2D; streamsync; endloop(1);
loop(1000); H2D; H2D; H2D; streamsync; endloop(1);
loop(10000); H2D; H2D; H2D; streamsync; endloop(1);
loop(10000); H2D; H2D; H2D; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(10); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(100); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(100); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(10); NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(100); NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(100); NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(10); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(100); NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1);
loop(100); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); NullKernel; NullKernel; NullKernel; NullKernel;NullKernel; streamsync; endloop(1);
loop(10); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(100); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1);
loop(100); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); NullKernel; NullKernel; NullKernel; NullKernel;NullKernel;NullKernel; streamsync; endloop(1);
loop(10); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1);
loop(100); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;NullKernel;streamsync; endloop(1);
loop(100); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1);
loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1);
loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1);
loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; streamsync; endloop(1);
loop(10); D2H; streamsync; endloop(1);
loop(100); D2H; streamsync; endloop(1);
loop(100); D2H; streamsync; endloop(1);
loop(1000); D2H; streamsync; endloop(1);
loop(1000); D2H; streamsync; endloop(1);
loop(10000); D2H; streamsync; endloop(1);
loop(10000); D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; H2D; streamsync; endloop(1);
loop(10); D2H; H2D; streamsync; endloop(1);
loop(100); D2H; H2D; streamsync; endloop(1);
loop(100); D2H; H2D; streamsync; endloop(1);
loop(1000); D2H; H2D; streamsync; endloop(1);
loop(1000); D2H; H2D; streamsync; endloop(1);
loop(10000); D2H; H2D; streamsync; endloop(1);
loop(10000); D2H; H2D; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; NullKernel; streamsync; endloop(1);
loop(10); D2H; NullKernel; streamsync; endloop(1);
loop(100); D2H; NullKernel; streamsync; endloop(1);
loop(100); D2H; NullKernel; streamsync; endloop(1);
loop(1000); D2H; NullKernel; streamsync; endloop(1);
loop(1000); D2H; NullKernel; streamsync; endloop(1);
loop(10000); D2H; NullKernel; streamsync; endloop(1);
loop(10000); D2H; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; streamsync; H2D; streamsync; endloop(1);
loop(10); D2H; streamsync; H2D; streamsync; endloop(1);
loop(100); D2H; streamsync; H2D; streamsync; endloop(1);
loop(100); D2H; streamsync; H2D; streamsync; endloop(1);
loop(1000); D2H; streamsync; H2D; streamsync; endloop(1);
loop(1000); D2H; streamsync; H2D; streamsync; endloop(1);
loop(10000); D2H; streamsync; H2D; streamsync; endloop(1);
loop(10000); D2H; streamsync; H2D; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); D2H; streamsync;NullKernel; streamsync; endloop(1);
loop(10); D2H; streamsync;NullKernel; streamsync; endloop(1);
loop(100); D2H; streamsync;NullKernel; streamsync; endloop(1);
loop(100); D2H; streamsync;NullKernel; streamsync; endloop(1);
loop(1000); D2H; streamsync;NullKernel; streamsync; endloop(1);
loop(1000); D2H; streamsync;NullKernel; streamsync; endloop(1);
loop(10000); D2H; streamsync;NullKernel; streamsync; endloop(1);
loop(10000); D2H; streamsync;NullKernel; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); H2D; streamsync; endloop(1);
loop(10); H2D; streamsync; endloop(1);
loop(100); H2D; streamsync; endloop(1);
loop(100); H2D; streamsync; endloop(1);
loop(1000); H2D; streamsync; endloop(1);
loop(1000); H2D; streamsync; endloop(1);
loop(1000); H2D; streamsync; endloop(1);
loop(10000); H2D; streamsync; endloop(1);
loop(10000); H2D; streamsync; endloop(1);
@@ -0,0 +1,2 @@
setstream(1);
loop(10); H2D; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; D2H; streamsync; endloop(1);
loop(10); H2D; D2H; streamsync; endloop(1);
loop(100); H2D; D2H; streamsync; endloop(1);
loop(100); H2D; D2H; streamsync; endloop(1);
loop(1000); H2D; D2H; streamsync; endloop(1);
loop(1000); H2D; D2H; streamsync; endloop(1);
loop(10000); H2D; D2H; streamsync; endloop(1);
loop(10000); H2D; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; NullKernel; streamsync; endloop(1);
loop(10); H2D; NullKernel; streamsync; endloop(1);
loop(100); H2D; NullKernel; streamsync; endloop(1);
loop(100); H2D; NullKernel; streamsync; endloop(1);
loop(1000); H2D; NullKernel; streamsync; endloop(1);
loop(1000); H2D; NullKernel; streamsync; endloop(1);
loop(10000); H2D; NullKernel; streamsync; endloop(1);
loop(10000); H2D; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10);H2D; streamsync; NullKernel; D2H; streamsync;endloop(1);
loop(10); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1);
loop(100); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1);
loop(100); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1);
loop(1000); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1);
loop(1000); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1);
loop(10000); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1);
loop(10000); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10);H2D; NullKernel; D2H; streamsync;endloop(1);
loop(10); H2D; NullKernel; D2H; streamsync; endloop(1);
loop(100); H2D; NullKernel; D2H; streamsync; endloop(1);
loop(100); H2D; NullKernel; D2H; streamsync; endloop(1);
loop(1000); H2D; NullKernel; D2H; streamsync; endloop(1);
loop(1000); H2D; NullKernel; D2H; streamsync; endloop(1);
loop(10000); H2D; NullKernel; D2H; streamsync; endloop(1);
loop(10000); H2D; NullKernel; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; NullKernel; streamsync; endloop(1);
loop(10); H2D; NullKernel; streamsync; endloop(1);
loop(100); H2D; NullKernel; streamsync; endloop(1);
loop(100); H2D; NullKernel; streamsync; endloop(1);
loop(1000); H2D; NullKernel; streamsync; endloop(1);
loop(1000); H2D; NullKernel; streamsync; endloop(1);
loop(10000); H2D; NullKernel; streamsync; endloop(1);
loop(10000); H2D ; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; streamsync; D2H; streamsync; endloop(1);
loop(10); H2D; streamsync; D2H; streamsync; endloop(1);
loop(100); H2D; streamsync; D2H; streamsync; endloop(1);
loop(100); H2D; streamsync; D2H; streamsync; endloop(1);
loop(1000); H2D; streamsync; D2H; streamsync; endloop(1);
loop(1000); H2D; streamsync; D2H; streamsync; endloop(1);
loop(10000); H2D; streamsync; D2H; streamsync; endloop(1);
loop(10000); H2D; streamsync; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(10); H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(100); H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(100); H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(1000); H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(10000); H2D; streamsync; NullKernel; streamsync; endloop(1);
loop(10000); H2D; streamsync; NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10);H2D; streamsync; NullKernel;streamsync; D2H; streamsync;endloop(1);
loop(10); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(100); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(100); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(1000); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(1000); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(10000); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(10000); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1);
@@ -0,0 +1,10 @@
setstream(1);
loop(10); NullKernel; streamsync; endloop(1);
loop(10); NullKernel; streamsync; endloop(1);
loop(100); NullKernel; streamsync; endloop(1);
loop(100); NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; streamsync; streamsync; endloop(1);
loop(10); NullKernel; streamsync; streamsync; endloop(1);
loop(100); NullKernel; streamsync; streamsync; endloop(1);
loop(100); NullKernel; streamsync; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; D2H; streamsync; endloop(1);
loop(10); NullKernel; D2H; streamsync; endloop(1);
loop(100); NullKernel; D2H; streamsync; endloop(1);
loop(100); NullKernel; D2H; streamsync; endloop(1);
loop(1000); NullKernel; D2H; streamsync; endloop(1);
loop(1000); NullKernel; D2H; streamsync; endloop(1);
loop(10000); NullKernel; D2H; streamsync; endloop(1);
loop(10000); NullKernel; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; H2D; streamsync; endloop(1);
loop(10); NullKernel; H2D; streamsync; endloop(1);
loop(100); NullKernel; H2D; streamsync; endloop(1);
loop(100); NullKernel; H2D; streamsync; endloop(1);
loop(1000); NullKernel; H2D; streamsync; endloop(1);
loop(1000); NullKernel; H2D; streamsync; endloop(1);
loop(10000); NullKernel; H2D; streamsync; endloop(1);
loop(10000); NullKernel; H2D; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(10); NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(100); NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(100); NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; D2H; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; D2H; streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(10); NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(100); NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(100); NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(1000); NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; H2D; streamsync; endloop(1);
loop(10000); NullKernel; streamsync; H2D; streamsync; endloop(1);
@@ -0,0 +1,2 @@
setstream(1);
loop(10);setstream(1);setstream(2);setstream(3);setstream(4);setstream(5);streamsync; endloop(1);
@@ -0,0 +1,9 @@
setstream(1);
loop(10); streamsync; endloop(1);
loop(10); streamsync; endloop(1);
loop(100); streamsync; endloop(1);
loop(100); streamsync; endloop(1);
loop(1000); streamsync; endloop(1);
loop(1000); streamsync; endloop(1);
loop(10000); streamsync; endloop(1);
loop(10000); streamsync; endloop(1);
@@ -63,6 +63,14 @@ double bytesToGB(size_t s)
return (double)s / (1024.0*1024.0*1024.0);
}
#define printLimit(w1, limit, units) \
{\
size_t val;\
cudaDeviceGetLimit(&val, limit);\
std::cout << setw(w1) << #limit": " << val << " " << units << std::endl;\
}
void printDeviceProp (int deviceId)
{
using namespace std;
@@ -144,6 +152,17 @@ void printDeviceProp (int deviceId)
cout << endl;
#ifdef __HIP_PLATFORM_NVCC__
// Limits:
cout << endl;
printLimit(w1, cudaLimitStackSize, "bytes/thread");
printLimit(w1, cudaLimitPrintfFifoSize, "bytes/device");
printLimit(w1, cudaLimitMallocHeapSize, "bytes/device");
printLimit(w1, cudaLimitDevRuntimeSyncDepth, "grids");
printLimit(w1, cudaLimitDevRuntimePendingLaunchCount, "launches");
#endif
cout << endl;
@@ -3,6 +3,10 @@ ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET)))
$(error gfx701 is not a supported device for this sample)
endif
HIPCC=$(HIP_PATH)/bin/hipcc
TARGET=hcc
@@ -22,7 +26,7 @@ CXX=$(HIPCC)
$(EXECUTABLE): $(OBJECTS)
$(HIPCC) $(OBJECTS) -o $@
$(HIPCC) $(OBJECTS) -o $@
test: $(EXECUTABLE)
@@ -33,4 +37,3 @@ clean:
rm -f $(EXECUTABLE)
rm -f $(OBJECTS)
rm -f $(HIP_PATH)/src/*.o
@@ -3,6 +3,10 @@ ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET)))
$(error gfx701 is not a supported device for this sample)
endif
HIPCC=$(HIP_PATH)/bin/hipcc
TARGET=hcc
@@ -1,34 +0,0 @@
HIP_PATH?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
HIPCC=$(HIP_PATH)/bin/hipcc
HIPCC_FLAGS += -std=c++11
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
ifeq (${HIP_PLATFORM}, nvcc)
LIBS = -lcublas
endif
ifeq (${HIP_PLATFORM}, hcc)
HCBLAS_ROOT?= $(wildcard /opt/rocm/hcblas)
HIPCC_FLAGS += -stdlib=libc++ -I$(HCBLAS_ROOT)/include
LIBS = -L$(HCBLAS_ROOT)/lib -lhipblas -rpath $(HIP_PATH)/lib
endif
all: saxpy.hipblas.out
saxpy.cublas.out : saxpy.cublas.cpp
nvcc -std=c++11 -I$(CUDA_HOME)/include saxpy.cublas.cpp -o $@ -L$(CUDA_HOME)/lib64 -lcublas
# $HIPBLAS_ROOT/bin/hipifyblas ./saxpy.cublas.cpp > ./saxpy.hipblas.cpp
# Then review & finish port in saxpy.hipblas.cpp
saxpy.hipblasref.o: saxpy.hipblasref.cpp
$(HIPCC) $(HIPCC_FLAGS) -c $< -o $@
saxpy.hipblas.out: saxpy.hipblasref.o
$(HIPCC) $< -o $@ $(LIBS)
clean:
rm -f *.o *.out
@@ -1,94 +0,0 @@
#include <random>
#include <algorithm>
#include <iostream>
#include <cmath>
// header file for the GPU API
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define N (1024 * 500)
#define CHECK(cmd) \
{\
cudaError_t error = cmd; \
if (error != cudaSuccess) { \
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
exit(EXIT_FAILURE);\
}\
}
#define CHECK_BLAS(cmd) \
{\
cublasStatus_t error = cmd;\
if (error != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "error: (%d) at %s:%d\n", error,__FILE__, __LINE__); \
exit(EXIT_FAILURE);\
}\
}
int main() {
const float a = 100.0f;
float x[N];
float y[N], y_cpu_res[N], y_gpu_res[N];
// initialize the input data
std::default_random_engine random_gen;
std::uniform_real_distribution<float> distribution(-N, N);
std::generate_n(x, N, [&]() { return distribution(random_gen); });
std::generate_n(y, N, [&]() { return distribution(random_gen); });
std::copy_n(y, N, y_cpu_res);
// Explicit GPU code:
size_t Nbytes = N*sizeof(float);
float *x_gpu, *y_gpu;
cublasHandle_t handle;
cudaDeviceProp props;
CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/));
printf ("info: running on device %s\n", props.name);
printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
CHECK(cudaMalloc(&x_gpu, Nbytes));
CHECK(cudaMalloc(&y_gpu, Nbytes));
// Initialize the blas library
CHECK_BLAS ( cublasCreate(&handle));
// copy n elements from a vector in host memory space to a vector in GPU memory space
printf ("info: copy Host2Device\n");
CHECK_BLAS ( cublasSetVector(N, sizeof(*x), x, 1, x_gpu, 1));
CHECK_BLAS ( cublasSetVector(N, sizeof(*y), y, 1, y_gpu, 1));
printf ("info: launch 'saxpy' kernel\n");
CHECK_BLAS ( cublasSaxpy(handle, N, &a, x_gpu, 1, y_gpu, 1));
cudaDeviceSynchronize();
printf ("info: copy Device2Host\n");
CHECK_BLAS ( cublasGetVector(N, sizeof(*y_gpu_res), y_gpu, 1, y_gpu_res, 1));
// CPU implementation of saxpy
for (int i = 0; i < N; i++) {
y_cpu_res[i] = a * x[i] + y[i];
}
// verify the results
int errors = 0;
for (int i = 0; i < N; i++) {
if (fabs(y_cpu_res[i] - y_gpu_res[i]) > fabs(y_cpu_res[i] * 0.0001f))
errors++;
}
std::cout << errors << " errors" << std::endl;
CHECK( cudaFree(x_gpu));
CHECK( cudaFree(y_gpu));
CHECK_BLAS( cublasDestroy(handle));
return errors;
}
@@ -1,94 +0,0 @@
#include <random>
#include <algorithm>
#include <iostream>
#include <cmath>
// header file for the GPU API
#include "hip/hip_runtime.h"
#include <hipblas.h>
#define N (1024 * 500)
#define CHECK(cmd) \
{\
hipError_t error = cmd; \
if (error != hipSuccess) { \
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
exit(EXIT_FAILURE);\
}\
}
#define CHECK_BLAS(cmd) \
{\
hipblasStatus_t error = cmd;\
if (error != HIPBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "error: (%d) at %s:%d\n", error,__FILE__, __LINE__); \
exit(EXIT_FAILURE);\
}\
}
int main() {
const float a = 100.0f;
float x[N];
float y[N], y_cpu_res[N], y_gpu_res[N];
// initialize the input data
std::default_random_engine random_gen;
std::uniform_real_distribution<float> distribution(-N, N);
std::generate_n(x, N, [&]() { return distribution(random_gen); });
std::generate_n(y, N, [&]() { return distribution(random_gen); });
std::copy_n(y, N, y_cpu_res);
// Explicit GPU code:
size_t Nbytes = N*sizeof(float);
float *x_gpu, *y_gpu;
hipblasHandle_t handle;
hipDeviceProp_t props;
CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
printf ("info: running on device %s\n", props.name);
printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
CHECK(hipMalloc(&x_gpu, Nbytes));
CHECK(hipMalloc(&y_gpu, Nbytes));
// Initialize the blas library
CHECK_BLAS ( hipblasCreate(&handle));
// copy n elements from a vector in host memory space to a vector in GPU memory space
printf ("info: copy Host2Device\n");
CHECK_BLAS ( hipblasSetVector(N, sizeof(*x), x, 1, x_gpu, 1));
CHECK_BLAS ( hipblasSetVector(N, sizeof(*y), y, 1, y_gpu, 1));
printf ("info: launch 'saxpy' kernel\n");
CHECK_BLAS ( hipblasSaxpy(handle, N, &a, x_gpu, 1, y_gpu, 1));
hipDeviceSynchronize();
printf ("info: copy Device2Host\n");
CHECK_BLAS ( hipblasGetVector(N, sizeof(*y_gpu_res), y_gpu, 1, y_gpu_res, 1));
// CPU implementation of saxpy
for (int i = 0; i < N; i++) {
y_cpu_res[i] = a * x[i] + y[i];
}
// verify the results
int errors = 0;
for (int i = 0; i < N; i++) {
if (fabs(y_cpu_res[i] - y_gpu_res[i]) > fabs(y_cpu_res[i] * 0.0001f))
errors++;
}
std::cout << errors << " errors" << std::endl;
CHECK( hipFree(x_gpu));
CHECK( hipFree(y_gpu));
CHECK_BLAS( hipblasDestroy(handle));
return errors;
}