From e22f2e1714f70fff751d2d81bf798337bbb984a3 Mon Sep 17 00:00:00 2001 From: Sarbojit Sarkar Date: Wed, 13 Apr 2022 06:35:25 +0000 Subject: [PATCH] SWDEV-323441 - support for default stream per thread Change-Id: I0032da0357f5cffbf5e4ec4a02435d2a128a262b [ROCm/clr commit: fc1f02bbed56b397fd978a7db0e72a08880fec41] --- .../hip/amd_detail/amd_hip_runtime_pt_api.h | 37 +++- projects/clr/hipamd/src/amdhip.def | 19 ++ projects/clr/hipamd/src/hip_event.cpp | 23 +- projects/clr/hipamd/src/hip_hcc.def.in | 19 ++ projects/clr/hipamd/src/hip_hcc.map.in | 25 +++ projects/clr/hipamd/src/hip_internal.hpp | 10 +- projects/clr/hipamd/src/hip_memory.cpp | 198 ++++++++++++++---- projects/clr/hipamd/src/hip_module.cpp | 60 +++++- projects/clr/hipamd/src/hip_stream.cpp | 182 +++++++++++----- 9 files changed, 455 insertions(+), 118 deletions(-) diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_runtime_pt_api.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_runtime_pt_api.h index 666a4c44b4..971af7ba92 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_runtime_pt_api.h @@ -34,6 +34,7 @@ THE SOFTWARE. #endif #if defined(__HIP_STREAM_PER_THREAD) + // Memory APIs #define hipMemcpy __HIP_API_SPT(hipMemcpy) #define hipMemcpyToSymbol __HIP_API_SPT(hipMemcpyToSymbol) #define hipMemcpyFromSymbol __HIP_API_SPT(hipMemcpyFromSymbol) @@ -44,10 +45,21 @@ THE SOFTWARE. #define hipMemset __HIP_API_SPT(hipMemset) #define hipMemset2D __HIP_API_SPT(hipMemset2D) #define hipMemset3D __HIP_API_SPT(hipMemset3D) + #define hipMemcpyAsync __HIP_API_SPT(hipMemcpyAsync) - #define hipMemcpyAsync __HIP_API_SPT(hipMemcpyAsync) - #define hipLaunchKernel __HIP_API_SPT(hipLaunchKernel) - #define hipStreamSynchronize __HIP_API_SPT(hipStreamSynchronize) + // Stream APIs + #define hipStreamSynchronize __HIP_API_SPT(hipStreamSynchronize) + #define hipStreamQuery __HIP_API_SPT(hipStreamQuery) + #define hipStreamGetFlags __HIP_API_SPT(hipStreamGetFlags) + #define hipStreamGetPriority __HIP_API_SPT(hipStreamGetPriority) + #define hipStreamWaitEvent __HIP_API_SPT(hipStreamWaitEvent) + + // Event APIs + #define hipEventRecord __HIP_API_SPT(hipEventRecord) + + // Launch APIs + #define hipLaunchKernel __HIP_API_SPT(hipLaunchKernel) + #define hipLaunchCooperativeKernel __HIP_API_SPT(hipLaunchCooperativeKernel) #endif hipError_t hipMemcpy_spt(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); @@ -77,10 +89,29 @@ hipError_t hipMemset3D_spt(hipPitchedPtr pitchedDevPtr, int value, hipExtent ex hipError_t hipMemcpyAsync_spt(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream); + +hipError_t hipStreamQuery_spt(hipStream_t stream); + hipError_t hipStreamSynchronize_spt(hipStream_t stream); + +hipError_t hipStreamGetPriority_spt(hipStream_t stream, int* priority); + +hipError_t hipStreamWaitEvent_spt(hipStream_t stream, hipEvent_t event, unsigned int flags); + +hipError_t hipStreamGetFlags_spt(hipStream_t stream, unsigned int* flags); + +hipError_t hipLaunchCooperativeKernel_spt(const void* f, + dim3 gridDim, dim3 blockDim, + void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream); +#ifdef __cplusplus +extern "C" { +#endif hipError_t hipLaunchKernel_spt(const void* function_address, dim3 numBlocks, dim3 dimBlocks, void** args, size_t sharedMemBytes, hipStream_t stream); +#ifdef __cplusplus +} +#endif // extern "C" #endif //HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H \ No newline at end of file diff --git a/projects/clr/hipamd/src/amdhip.def b/projects/clr/hipamd/src/amdhip.def index a1f7afdd29..c44f60d1c1 100644 --- a/projects/clr/hipamd/src/amdhip.def +++ b/projects/clr/hipamd/src/amdhip.def @@ -377,3 +377,22 @@ hipMemRelease hipMemRetainAllocationHandle hipMemSetAccess hipMemUnmap +hipMemcpy_spt +hipMemcpyAsync_spt +hipStreamSynchronize_spt +hipMemcpyToSymbol_spt +hipMemcpyFromSymbol_spt +hipMemcpy2D_spt +hipMemcpy2DToArray_spt +hipMemcpy2DFromArray_spt +hipMemcpy3D_spt +hipMemset_spt +hipMemset2D_spt +hipMemset3D_spt +hipStreamQuery_spt +hipStreamGetFlags_spt +hipStreamGetPriority_spt +hipStreamWaitEvent_spt +hipEventRecord_spt +hipLaunchKernel_spt +hipLaunchCooperativeKernel_spt diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index 762e98e2f9..ab428f76f4 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -336,24 +336,29 @@ hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop) { HIP_RETURN(eStart->elapsedTime(*eStop, *ms), "Elapsed Time = ", *ms); } -hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) { - HIP_INIT_API(hipEventRecord, event, stream); - +hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream) { STREAM_CAPTURE(hipEventRecord, stream, event); if (event == nullptr) { - HIP_RETURN(hipErrorInvalidHandle); + return hipErrorInvalidHandle; } - hip::Event* e = reinterpret_cast(event); - amd::HostQueue* queue = hip::getQueue(stream); - if (g_devices[e->deviceId()]->devices()[0] != &queue->device()) { - HIP_RETURN(hipErrorInvalidHandle); + return hipErrorInvalidHandle; } + return e->addMarker(stream, nullptr, true); +} - HIP_RETURN(e->addMarker(stream, nullptr, true)); +hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) { + HIP_INIT_API(hipEventRecord, event, stream); + HIP_RETURN(hipEventRecord_common(event, stream)); +} + +hipError_t hipEventRecord_spt(hipEvent_t event, hipStream_t stream) { + HIP_INIT_API(hipEventRecord, event, stream); + PER_THREAD_DEFAULT_STREAM(stream); + HIP_RETURN(hipEventRecord_common(event, stream)); } hipError_t hipEventSynchronize(hipEvent_t event) { diff --git a/projects/clr/hipamd/src/hip_hcc.def.in b/projects/clr/hipamd/src/hip_hcc.def.in index f12a5eff5a..4eba25dfd4 100644 --- a/projects/clr/hipamd/src/hip_hcc.def.in +++ b/projects/clr/hipamd/src/hip_hcc.def.in @@ -385,3 +385,22 @@ hipMemRelease hipMemRetainAllocationHandle hipMemSetAccess hipMemUnmap +hipMemcpy_spt +hipMemcpyAsync_spt +hipStreamSynchronize_spt +hipMemcpyToSymbol_spt +hipMemcpyFromSymbol_spt +hipMemcpy2D_spt +hipMemcpy2DToArray_spt +hipMemcpy2DFromArray_spt +hipMemcpy3D_spt +hipMemset_spt +hipMemset2D_spt +hipMemset3D_spt +hipStreamQuery_spt +hipStreamGetFlags_spt +hipStreamGetPriority_spt +hipStreamWaitEvent_spt +hipEventRecord_spt +hipLaunchKernel_spt +hipLaunchCooperativeKernel_spt diff --git a/projects/clr/hipamd/src/hip_hcc.map.in b/projects/clr/hipamd/src/hip_hcc.map.in index c3e1dea158..07897c75a8 100644 --- a/projects/clr/hipamd/src/hip_hcc.map.in +++ b/projects/clr/hipamd/src/hip_hcc.map.in @@ -434,3 +434,28 @@ global: local: *; } hip_5.0; + +hip_5.2 { +global: + hipMemcpy_spt; + hipMemcpyAsync_spt; + hipStreamSynchronize_spt; + hipMemcpyToSymbol_spt; + hipMemcpyFromSymbol_spt; + hipMemcpy2D_spt; + hipMemcpy2DToArray_spt; + hipMemcpy2DFromArray_spt; + hipMemcpy3D_spt; + hipMemset_spt; + hipMemset2D_spt; + hipMemset3D_spt; + hipStreamQuery_spt; + hipStreamGetFlags_spt; + hipStreamGetPriority_spt; + hipStreamWaitEvent_spt; + hipEventRecord_spt; + hipLaunchKernel_spt; + hipLaunchCooperativeKernel_spt; +local: + *; +} hip_5.1; \ No newline at end of file diff --git a/projects/clr/hipamd/src/hip_internal.hpp b/projects/clr/hipamd/src/hip_internal.hpp index 6e80cc2503..9c21406fca 100644 --- a/projects/clr/hipamd/src/hip_internal.hpp +++ b/projects/clr/hipamd/src/hip_internal.hpp @@ -171,7 +171,7 @@ static amd::Monitor g_hipInitlock{"hipInit lock"}; // Sync APIs cannot be called when stream capture is active #define CHECK_STREAM_CAPTURING() \ if (!g_captureStreams.empty()) { \ - HIP_RETURN(hipErrorStreamCaptureImplicit); \ + return hipErrorStreamCaptureImplicit; \ } #define STREAM_CAPTURE(name, stream, ...) \ @@ -180,7 +180,7 @@ static amd::Monitor g_hipInitlock{"hipInit lock"}; reinterpret_cast(stream)->GetCaptureStatus() == \ hipStreamCaptureStatusActive) { \ hipError_t status = capture##name(stream, ##__VA_ARGS__); \ - HIP_RETURN(status); \ + return status; \ } #define EVENT_CAPTURE(name, event, ...) \ @@ -189,6 +189,11 @@ static amd::Monitor g_hipInitlock{"hipInit lock"}; HIP_RETURN(status); \ } +#define PER_THREAD_DEFAULT_STREAM(stream) \ + if (stream == nullptr) { \ + stream = getPerThreadDefaultStream(); \ + } + namespace hc { class accelerator; class accelerator_view; @@ -484,6 +489,7 @@ extern hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags); extern amd::Memory* getMemoryObject(const void* ptr, size_t& offset, size_t size = 0); extern amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size); extern void getStreamPerThread(hipStream_t& stream); +extern hipStream_t getPerThreadDefaultStream(); extern hipError_t ihipUnbindTexture(textureReference* texRef); extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device); diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 0f359eef0b..8b03dd7081 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -538,11 +538,27 @@ hipError_t hipFree(void* ptr) { HIP_RETURN(ihipFree(ptr)); } +hipError_t hipMemcpy_common(void* dst, const void* src, size_t sizeBytes, + hipMemcpyKind kind, hipStream_t stream = nullptr) { + CHECK_STREAM_CAPTURING(); + amd::HostQueue* queue = nullptr; + + if (stream != nullptr) { + queue = hip::getQueue(stream); + } else { + queue = hip::getNullStream(); + } + return ihipMemcpy(dst, src, sizeBytes, kind, *queue); +} + hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpy, dst, src, sizeBytes, kind); - CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue)); + HIP_RETURN_DURATION(hipMemcpy_common(dst, src, sizeBytes, kind)); +} + +hipError_t hipMemcpy_spt(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpy, dst, src, sizeBytes, kind); + HIP_RETURN_DURATION(hipMemcpy_common(dst, src, sizeBytes, kind, getPerThreadDefaultStream())); } hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, @@ -1073,9 +1089,8 @@ inline hipError_t ihipMemcpySymbol_validate(const void* symbol, size_t sizeBytes return hipSuccess; } -hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeBytes, - size_t offset, hipMemcpyKind kind) { - HIP_INIT_API(hipMemcpyToSymbol, symbol, src, sizeBytes, offset, kind); +hipError_t hipMemcpyToSymbol_common(const void* symbol, const void* src, size_t sizeBytes, + size_t offset, hipMemcpyKind kind, hipStream_t stream=nullptr) { CHECK_STREAM_CAPTURING(); size_t sym_size = 0; hipDeviceptr_t device_ptr = nullptr; @@ -1086,23 +1101,48 @@ hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeByt } /* Copy memory from source to destination address */ - HIP_RETURN_DURATION(hipMemcpy(device_ptr, src, sizeBytes, kind)); + return hipMemcpy_common(device_ptr, src, sizeBytes, kind, stream); +} + +hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeBytes, + size_t offset, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpyToSymbol, symbol, src, sizeBytes, offset, kind); + HIP_RETURN_DURATION(hipMemcpyToSymbol_common(symbol, src, sizeBytes, offset, kind)); +} + +hipError_t hipMemcpyToSymbol_spt(const void* symbol, const void* src, size_t sizeBytes, + size_t offset, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpyToSymbol, symbol, src, sizeBytes, offset, kind); + HIP_RETURN_DURATION(hipMemcpyToSymbol_common(symbol, src, sizeBytes, offset, kind, + getPerThreadDefaultStream())); +} + +hipError_t hipMemcpyFromSymbol_common(void* dst, const void* symbol, size_t sizeBytes, + size_t offset, hipMemcpyKind kind, hipStream_t stream=nullptr) { + CHECK_STREAM_CAPTURING(); + size_t sym_size = 0; + hipDeviceptr_t device_ptr = nullptr; + + hipError_t status = ihipMemcpySymbol_validate(symbol, sizeBytes, offset, sym_size, device_ptr); + if (status != hipSuccess) { + return status; + } + + /* Copy memory from source to destination address */ + return hipMemcpy_common(dst, device_ptr, sizeBytes, kind, stream); } hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol, size_t sizeBytes, size_t offset, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpyFromSymbol, symbol, dst, sizeBytes, offset, kind); - CHECK_STREAM_CAPTURING(); - size_t sym_size = 0; - hipDeviceptr_t device_ptr = nullptr; + HIP_RETURN_DURATION(hipMemcpyFromSymbol_common(dst, symbol, sizeBytes, offset, kind)); +} - hipError_t status = ihipMemcpySymbol_validate(symbol, sizeBytes, offset, sym_size, device_ptr); - if (status != hipSuccess) { - return status; - } - - /* Copy memory from source to destination address */ - HIP_RETURN_DURATION(hipMemcpy(dst, device_ptr, sizeBytes, kind)); +hipError_t hipMemcpyFromSymbol_spt(void* dst, const void* symbol, size_t sizeBytes, + size_t offset, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpyFromSymbol, symbol, dst, sizeBytes, offset, kind); + HIP_RETURN_DURATION(hipMemcpyFromSymbol_common(dst, symbol, sizeBytes, offset, kind, + getPerThreadDefaultStream())); } hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, @@ -1164,15 +1204,25 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(nullptr))); } -hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, +hipError_t hipMemcpyAsync_common(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_API(hipMemcpyAsync, dst, src, sizeBytes, kind, stream); - STREAM_CAPTURE(hipMemcpyAsync, stream, dst, src, sizeBytes, kind); amd::HostQueue* queue = hip::getQueue(stream); + return ihipMemcpy(dst, src, sizeBytes, kind, *queue, true); +} - HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, true)); +hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, + hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_API(hipMemcpyAsync, dst, src, sizeBytes, kind, stream); + HIP_RETURN_DURATION(hipMemcpyAsync_common(dst, src, sizeBytes, kind, stream)); +} + +hipError_t hipMemcpyAsync_spt(void* dst, const void* src, size_t sizeBytes, + hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_API(hipMemcpyAsync, dst, src, sizeBytes, kind, stream); + PER_THREAD_DEFAULT_STREAM(stream); + HIP_RETURN_DURATION(hipMemcpyAsync_common(dst, src, sizeBytes, kind, stream)); } hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, void* srcHost, size_t ByteCount, @@ -1965,11 +2015,23 @@ hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) { HIP_RETURN_DURATION(ihipMemcpyParam2D(pCopy, nullptr)); } +hipError_t hipMemcpy2D_common(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind, hipStream_t stream = nullptr) { + CHECK_STREAM_CAPTURING(); + return ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, stream); +} + hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpy2D, dst, dpitch, src, spitch, width, height, kind); - CHECK_STREAM_CAPTURING(); - HIP_RETURN_DURATION(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, nullptr)); + HIP_RETURN_DURATION(hipMemcpy2D_common(dst, dpitch, src, spitch, width, height, kind)); +} + +hipError_t hipMemcpy2D_spt(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpy2D, dst, dpitch, src, spitch, width, height, kind); + HIP_RETURN_DURATION(hipMemcpy2D_common(dst, dpitch, src, spitch, width, height, kind, + getPerThreadDefaultStream())); } hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, @@ -2008,14 +2070,25 @@ hipError_t ihipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset, c return ihipMemcpyParam2D(&desc, stream, isAsync); } -hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { - HIP_INIT_API(hipMemcpy2DToArray, dst, wOffset, hOffset, src, spitch, width, height, kind); +hipError_t hipMemcpy2DToArray_common(hipArray* dst, size_t wOffset, size_t hOffset, + const void* src, size_t spitch, size_t width, + size_t height, hipMemcpyKind kind, hipStream_t stream=nullptr) { CHECK_STREAM_CAPTURING(); if (spitch == 0) { HIP_RETURN(hipErrorInvalidPitchValue); } + return ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, stream); +} - HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, nullptr)); +hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpy2DToArray, dst, wOffset, hOffset, src, spitch, width, height, kind); + HIP_RETURN_DURATION(hipMemcpy2DToArray_common(dst, wOffset, hOffset, src, spitch, width, height, kind)); +} + +hipError_t hipMemcpy2DToArray_spt(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpy2DToArray, dst, wOffset, hOffset, src, spitch, width, height, kind); + HIP_RETURN_DURATION(hipMemcpy2DToArray_common(dst, wOffset, hOffset, src, spitch, + width, height, kind, getPerThreadDefaultStream())); } hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind) { @@ -2217,10 +2290,19 @@ hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p, hipStream_t stream, bool isAs return ihipMemcpyParam3D(&desc, stream, isAsync); } +hipError_t hipMemcpy3D_common(const hipMemcpy3DParms* p, hipStream_t stream = nullptr) { + CHECK_STREAM_CAPTURING(); + return ihipMemcpy3D(p, stream); +} + hipError_t hipMemcpy3D(const hipMemcpy3DParms* p) { HIP_INIT_API(hipMemcpy3D, p); - CHECK_STREAM_CAPTURING(); - HIP_RETURN_DURATION(ihipMemcpy3D(p, nullptr)); + HIP_RETURN_DURATION(hipMemcpy3D_common(p)); +} + +hipError_t hipMemcpy3D_spt(const hipMemcpy3DParms* p) { + HIP_INIT_API(hipMemcpy3D, p); + HIP_RETURN_DURATION(hipMemcpy3D_common(p, getPerThreadDefaultStream())); } hipError_t hipMemcpy3DAsync(const hipMemcpy3DParms* p, hipStream_t stream) { @@ -2357,10 +2439,19 @@ hipError_t ihipMemset(void* dst, int64_t value, size_t valueSize, size_t sizeByt return hip_error; } +hipError_t hipMemset_common(void* dst, int value, size_t sizeBytes, hipStream_t stream=nullptr) { + CHECK_STREAM_CAPTURING(); + return ihipMemset(dst, value, sizeof(int8_t), sizeBytes, stream); +} + +hipError_t hipMemset_spt(void* dst, int value, size_t sizeBytes) { + HIP_INIT_API(hipMemset, dst, value, sizeBytes); + HIP_RETURN(hipMemset_common(dst, value, sizeBytes, getPerThreadDefaultStream())); +} + hipError_t hipMemset(void* dst, int value, size_t sizeBytes) { HIP_INIT_API(hipMemset, dst, value, sizeBytes); - CHECK_STREAM_CAPTURING(); - HIP_RETURN(ihipMemset(dst, value, sizeof(int8_t), sizeBytes, nullptr)); + HIP_RETURN(hipMemset_common(dst, value, sizeBytes)); } hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream) { @@ -2490,12 +2581,24 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent return hipSuccess; } +hipError_t hipMemset2D_common(void* dst, size_t pitch, int value, size_t width, + size_t height, hipStream_t stream=nullptr) { + CHECK_STREAM_CAPTURING(); + return ihipMemset3D({dst, pitch, width, height}, value, {width, height, 1}, stream); +} + +hipError_t hipMemset2D_spt(void* dst, size_t pitch, int value, size_t width, size_t height) { + HIP_INIT_API(hipMemset2D, dst, pitch, value, width, height); + hipStream_t stream = getPerThreadDefaultStream(); + HIP_RETURN(hipMemset2D_common(dst, pitch, value, width, height, stream)); +} + hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height) { HIP_INIT_API(hipMemset2D, dst, pitch, value, width, height); - CHECK_STREAM_CAPTURING(); - HIP_RETURN(ihipMemset3D({dst, pitch, width, height}, value, {width, height, 1}, nullptr)); + HIP_RETURN(hipMemset2D_common(dst, pitch, value, width, height)); } + hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height, hipStream_t stream) { HIP_INIT_API(hipMemset2DAsync, dst, pitch, value, width, height, stream); @@ -2505,10 +2608,20 @@ hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, HIP_RETURN(ihipMemset3D({dst, pitch, width, height}, value, {width, height, 1}, stream, true)); } +hipError_t hipMemset3D_common(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, hipStream_t stream=nullptr) { + CHECK_STREAM_CAPTURING(); + return ihipMemset3D(pitchedDevPtr, value, extent, stream); +} + hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent) { HIP_INIT_API(hipMemset3D, pitchedDevPtr, value, extent); - CHECK_STREAM_CAPTURING(); - HIP_RETURN(ihipMemset3D(pitchedDevPtr, value, extent, nullptr)); + HIP_RETURN(hipMemset3D_common(pitchedDevPtr, value, extent)); +} + +hipError_t hipMemset3D_spt(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent) { + HIP_INIT_API(hipMemset3D, pitchedDevPtr, value, extent); + hipStream_t stream = getPerThreadDefaultStream(); + HIP_RETURN(hipMemset3D_common(pitchedDevPtr, value, extent,stream)); } hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, hipStream_t stream) { @@ -2970,14 +3083,25 @@ hipError_t hipMemcpyArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffs HIP_RETURN_DURATION(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr)); } -hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind) { - HIP_INIT_API(hipMemcpy2DFromArray, dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind); +hipError_t hipMemcpy2DFromArray_common(void* dst, size_t dpitch, hipArray_const_t src, + size_t wOffsetSrc, size_t hOffset, size_t width, + size_t height, hipMemcpyKind kind, hipStream_t stream=nullptr) { CHECK_STREAM_CAPTURING(); if (dpitch == 0) { HIP_RETURN(hipErrorInvalidPitchValue); } + return ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind, stream); +} - HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind, nullptr)); +hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpy2DFromArray, dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind); + HIP_RETURN_DURATION(hipMemcpy2DFromArray_common(dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind)); +} + +hipError_t hipMemcpy2DFromArray_spt(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind) { + HIP_INIT_API(hipMemcpy2DFromArray, dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind); + hipStream_t stream = getPerThreadDefaultStream(); + HIP_RETURN_DURATION(hipMemcpy2DFromArray_common(dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind, stream)); } hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index 3c538420c7..1fe0fed6ff 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -486,6 +486,18 @@ hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t globalWorkSizeX, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent)); } +extern "C" hipError_t hipLaunchKernel_common(const void *hostFunction, + dim3 gridDim, + dim3 blockDim, + void** args, + size_t sharedMemBytes, + hipStream_t stream) +{ + STREAM_CAPTURE(hipLaunchKernel, stream, hostFunction, gridDim, blockDim, args, sharedMemBytes); + return ihipLaunchKernel(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream, + nullptr, nullptr, 0); +} + extern "C" hipError_t hipLaunchKernel(const void *hostFunction, dim3 gridDim, dim3 blockDim, @@ -493,10 +505,20 @@ extern "C" hipError_t hipLaunchKernel(const void *hostFunction, size_t sharedMemBytes, hipStream_t stream) { - HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream); - STREAM_CAPTURE(hipLaunchKernel, stream, hostFunction, gridDim, blockDim, args, sharedMemBytes); - HIP_RETURN(ihipLaunchKernel(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream, - nullptr, nullptr, 0)); + HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream); + HIP_RETURN(hipLaunchKernel_common(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream)); +} + +extern "C" hipError_t hipLaunchKernel_spt(const void *hostFunction, + dim3 gridDim, + dim3 blockDim, + void** args, + size_t sharedMemBytes, + hipStream_t stream) +{ + HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream); + PER_THREAD_DEFAULT_STREAM(stream); + HIP_RETURN(hipLaunchKernel_common(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream)); } extern "C" hipError_t hipExtLaunchKernel(const void* hostFunction, @@ -513,13 +535,10 @@ extern "C" hipError_t hipExtLaunchKernel(const void* hostFunction, HIP_RETURN(ihipLaunchKernel(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream, startEvent, stopEvent, flags)); } -hipError_t hipLaunchCooperativeKernel(const void* f, +hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 blockDim, void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream) { - HIP_INIT_API(hipLaunchCooperativeKernel, f, gridDim, blockDim, - sharedMemBytes, hStream); - if (!hip::isValid(hStream)) { HIP_RETURN(hipErrorInvalidValue); } @@ -533,14 +552,33 @@ hipError_t hipLaunchCooperativeKernel(const void* f, if (globalWorkSizeX > std::numeric_limits::max() || globalWorkSizeY > std::numeric_limits::max() || globalWorkSizeZ > std::numeric_limits::max()) { - HIP_RETURN(hipErrorInvalidConfiguration); + return hipErrorInvalidConfiguration; } - HIP_RETURN(ihipModuleLaunchKernel(func, static_cast(globalWorkSizeX), + return ihipModuleLaunchKernel(func, static_cast(globalWorkSizeX), static_cast(globalWorkSizeY), static_cast(globalWorkSizeZ), blockDim.x, blockDim.y, blockDim.z, sharedMemBytes, hStream, kernelParams, nullptr, nullptr, nullptr, 0, - amd::NDRangeKernelCommand::CooperativeGroups)); + amd::NDRangeKernelCommand::CooperativeGroups); +} + +hipError_t hipLaunchCooperativeKernel(const void* f, + dim3 gridDim, dim3 blockDim, + void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream) +{ + HIP_INIT_API(hipLaunchCooperativeKernel, f, gridDim, blockDim, + sharedMemBytes, hStream); + HIP_RETURN(hipLaunchCooperativeKernel_common(f,gridDim, blockDim, kernelParams, sharedMemBytes, hStream)); +} + +hipError_t hipLaunchCooperativeKernel_spt(const void* f, + dim3 gridDim, dim3 blockDim, + void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream) +{ + HIP_INIT_API(hipLaunchCooperativeKernel, f, gridDim, blockDim, + sharedMemBytes, hStream); + PER_THREAD_DEFAULT_STREAM(hStream); + HIP_RETURN(hipLaunchCooperativeKernel_common(f, gridDim, blockDim, kernelParams, sharedMemBytes, hStream)); } hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, diff --git a/projects/clr/hipamd/src/hip_stream.cpp b/projects/clr/hipamd/src/hip_stream.cpp index 90775f5b97..f7b696508b 100644 --- a/projects/clr/hipamd/src/hip_stream.cpp +++ b/projects/clr/hipamd/src/hip_stream.cpp @@ -294,26 +294,45 @@ static hipError_t ihipStreamCreate(hipStream_t* stream, // ================================================================================================ class stream_per_thread { private: - hipStream_t m_stream; + std::vector m_streams; public: - stream_per_thread():m_stream(nullptr) {} + stream_per_thread() { + m_streams.resize(g_devices.size()); + for (auto &stream : m_streams) { + stream = nullptr; + } + } stream_per_thread(const stream_per_thread& ) = delete; void operator=(const stream_per_thread& ) = delete; ~stream_per_thread() { - if (m_stream != nullptr && hip::isValid(m_stream)) { - delete reinterpret_cast(m_stream); - m_stream = nullptr; + for (auto &stream:m_streams) { + if (stream != nullptr && hip::isValid(stream)) { + delete reinterpret_cast(stream); + stream = nullptr; + } } } - hipStream_t& get() { + hipStream_t get() { + hip::Device* device = hip::getCurrentDevice(); + int currDev = device->deviceId(); + // This is to make sure m_streams is not empty + if (m_streams.empty()) { + m_streams.resize(g_devices.size()); + for (auto &stream : m_streams) { + stream = nullptr; + } + } // There is a scenario where hipResetDevice destroys stream per thread // hence isValid check is required to make sure only valid stream is used - if (m_stream == nullptr || !hip::isValid(m_stream)) { - hipError_t err = ihipStreamCreate(&m_stream, hipStreamDefault, hip::Stream::Priority::Normal); - assert(err == hipSuccess); + if (m_streams[currDev] == nullptr || !hip::isValid(m_streams[currDev])) { + hipError_t status = ihipStreamCreate(&m_streams[currDev], hipStreamDefault, + hip::Stream::Priority::Normal); + if (status != hipSuccess) { + DevLogError("Stream creation failed\n"); + } } - return m_stream; + return m_streams[currDev]; } }; thread_local stream_per_thread streamPerThreadObj; @@ -325,6 +344,15 @@ void getStreamPerThread(hipStream_t& stream) { } } +// ================================================================================================ +hipStream_t getPerThreadDefaultStream() { + // Function to get per thread default stream + // More about the usecases yet to come + hipStream_t stream = hipStreamPerThread; + getStreamPerThread(stream); + return stream; +} + // ================================================================================================ hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags) { HIP_INIT_API(hipStreamCreateWithFlags, stream, flags); @@ -381,44 +409,53 @@ hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPrio } // ================================================================================================ -hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int* flags) { - HIP_INIT_API(hipStreamGetFlags, stream, flags); - - if (flags != nullptr) { - if (stream == nullptr) { - // hipStreamDefault - *flags = 0; - } else { - if (!hip::isValid(stream)) { - HIP_RETURN(hipErrorContextIsDestroyed); - } - *flags = reinterpret_cast(stream)->Flags(); +hipError_t hipStreamGetFlags_common(hipStream_t stream, unsigned int* flags) { + if ((flags != nullptr) && (stream != nullptr)) { + if (!hip::isValid(stream)) { + return hipErrorContextIsDestroyed; } + *flags = reinterpret_cast(stream)->Flags(); } else { - HIP_RETURN(hipErrorInvalidValue); + return hipErrorInvalidValue; } - HIP_RETURN(hipSuccess); + return hipSuccess; +} + +// ================================================================================================ +hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int* flags) { + HIP_INIT_API(hipStreamGetFlags, stream, flags); + HIP_RETURN(hipStreamGetFlags_common(stream, flags)); +} + +// ================================================================================================ +hipError_t hipStreamGetFlags_spt(hipStream_t stream, unsigned int* flags) { + HIP_INIT_API(hipStreamGetFlags, stream, flags); + PER_THREAD_DEFAULT_STREAM(stream); + HIP_RETURN(hipStreamGetFlags_common(stream, flags)); +} + +// ================================================================================================ +hipError_t hipStreamSynchronize_common(hipStream_t stream) { + if (!hip::isValid(stream)) { + HIP_RETURN(hipErrorContextIsDestroyed); + } + // Wait for the current host queue + hip::getQueue(stream)->finish(); + return hipSuccess; } // ================================================================================================ hipError_t hipStreamSynchronize(hipStream_t stream) { HIP_INIT_API(hipStreamSynchronize, stream); + HIP_RETURN(hipStreamSynchronize_common(stream)); +} - if (!hip::isValid(stream)) { - HIP_RETURN(hipErrorContextIsDestroyed); - } - - // Wait for the current host queue - hip::getQueue(stream)->finish(); - - // Make sure runtime releases memory for all memory pools on the device, - // associated with the queue - auto hip_stream = reinterpret_cast(stream); - auto device = (hip_stream == nullptr) ? hip::getCurrentDevice() : hip_stream->GetDevice(); - device->ReleaseFreedMemory(hip_stream); - - HIP_RETURN(hipSuccess); +// ================================================================================================ +hipError_t hipStreamSynchronize_spt(hipStream_t stream) { + HIP_INIT_API(hipStreamSynchronize, stream); + PER_THREAD_DEFAULT_STREAM(stream); + HIP_RETURN(hipStreamSynchronize_common(stream)); } // ================================================================================================ @@ -463,33 +500,42 @@ void WaitThenDecrementSignal(hipStream_t stream, hipError_t status, void* user_d } // ================================================================================================ -hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags) { - HIP_INIT_API(hipStreamWaitEvent, stream, event, flags); - +hipError_t hipStreamWaitEvent_common(hipStream_t stream, hipEvent_t event, unsigned int flags) { EVENT_CAPTURE(hipStreamWaitEvent, event, stream, flags); if (event == nullptr) { - HIP_RETURN(hipErrorInvalidHandle); + return hipErrorInvalidHandle; } if (flags != 0) { - HIP_RETURN(hipErrorInvalidValue); + return hipErrorInvalidValue; } if (!hip::isValid(stream)) { - HIP_RETURN(hipErrorContextIsDestroyed); + return hipErrorContextIsDestroyed; } hip::Event* e = reinterpret_cast(event); - HIP_RETURN(e->streamWait(stream, flags)); + return e->streamWait(stream, flags); } // ================================================================================================ -hipError_t hipStreamQuery(hipStream_t stream) { - HIP_INIT_API(hipStreamQuery, stream); +hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags) { + HIP_INIT_API(hipStreamWaitEvent, stream, event, flags); + HIP_RETURN(hipStreamWaitEvent_common(stream, event, flags)); +} +// ================================================================================================ +hipError_t hipStreamWaitEvent_spt(hipStream_t stream, hipEvent_t event, unsigned int flags) { + HIP_INIT_API(hipStreamWaitEvent, stream, event, flags); + PER_THREAD_DEFAULT_STREAM(stream); + HIP_RETURN(hipStreamWaitEvent_common(stream, event, flags)); +} + +// ================================================================================================ +hipError_t hipStreamQuery_common(hipStream_t stream) { if (!hip::isValid(stream)) { - HIP_RETURN(hipErrorContextIsDestroyed); + return hipErrorContextIsDestroyed; } amd::HostQueue* hostQueue = hip::getQueue(stream); @@ -497,7 +543,7 @@ hipError_t hipStreamQuery(hipStream_t stream) { amd::Command* command = hostQueue->getLastQueuedCommand(true); if (command == nullptr) { // Nothing was submitted to the queue - HIP_RETURN(hipSuccess); + return hipSuccess; } amd::Event& event = command->event(); @@ -511,7 +557,20 @@ hipError_t hipStreamQuery(hipStream_t stream) { } hipError_t status = ready ? hipSuccess : hipErrorNotReady; command->release(); - HIP_RETURN(status); + return status; +} + +// ================================================================================================ +hipError_t hipStreamQuery(hipStream_t stream) { + HIP_INIT_API(hipStreamQuery, stream); + HIP_RETURN(hipStreamQuery_common(stream)); +} + +// ================================================================================================ +hipError_t hipStreamQuery_spt(hipStream_t stream) { + HIP_INIT_API(hipStreamQuery, stream); + PER_THREAD_DEFAULT_STREAM(stream); + HIP_RETURN(hipStreamQuery_common(stream)); } // ================================================================================================ @@ -586,24 +645,35 @@ hipError_t hipExtStreamCreateWithCUMask(hipStream_t* stream, uint32_t cuMaskSize } // ================================================================================================ -hipError_t hipStreamGetPriority(hipStream_t stream, int* priority) { - HIP_INIT_API(hipStreamGetPriority, stream, priority); +hipError_t hipStreamGetPriority_common(hipStream_t stream, int* priority) { if ((priority != nullptr) && (stream == nullptr)) { *priority = 0; - HIP_RETURN(hipSuccess); + return hipSuccess; } if ((priority != nullptr) && (stream != nullptr)) { if (!hip::isValid(stream)) { - HIP_RETURN(hipErrorContextIsDestroyed); + return hipErrorContextIsDestroyed; } *priority = static_cast(reinterpret_cast(stream)->GetPriority()); } else { - HIP_RETURN(hipErrorInvalidValue); + return hipErrorInvalidValue; } - HIP_RETURN(hipSuccess); + return hipSuccess; +} +// ================================================================================================ +hipError_t hipStreamGetPriority(hipStream_t stream, int* priority) { + HIP_INIT_API(hipStreamGetPriority, stream, priority); + HIP_RETURN(hipStreamGetPriority_common(stream, priority)); +} + +// ================================================================================================ +hipError_t hipStreamGetPriority_spt(hipStream_t stream, int* priority) { + HIP_INIT_API(hipStreamGetPriority, stream, priority); + PER_THREAD_DEFAULT_STREAM(stream); + HIP_RETURN(hipStreamGetPriority_common(stream, priority)); } // ================================================================================================