Skip to content

Commit ab41f53

Browse files
committed
GPU: Remove non-working MI100 serialization workaround and obsolete StuckProtection
1 parent 1c7cd94 commit ab41f53

File tree

7 files changed: 0 additions and 33 deletions.

GPU/GPUTracking/Base/GPUReconstructionCPU.h

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -88,8 +88,6 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface
8888
int32_t ExitDevice() override;
8989
int32_t GetThread();
9090

91-
virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; }
92-
9391
// Pointers to tracker classes
9492
GPUProcessorProcessors mProcShadow; // Host copy of tracker objects that will be used on the GPU
9593
GPUConstantMem*& mProcessorsShadow = mProcShadow.mProcessorsProc;

GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx

Lines changed: 0 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -470,27 +470,6 @@ void GPUReconstructionOCL::ReleaseEvent(deviceEvent ev) { GPUChkErr(clReleaseEve
470470

471471
void GPUReconstructionOCL::RecordMarker(deviceEvent* ev, int32_t stream) { GPUChkErr(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList<cl_event>())); }
472472

473-
int32_t GPUReconstructionOCL::DoStuckProtection(int32_t stream, deviceEvent event)
474-
{
475-
if (GetProcessingSettings().stuckProtection) {
476-
cl_int tmp = 0;
477-
for (int32_t i = 0; i <= GetProcessingSettings().stuckProtection / 50; i++) {
478-
usleep(50);
479-
clGetEventInfo(event.get<cl_event>(), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(tmp), &tmp, nullptr);
480-
if (tmp == CL_COMPLETE) {
481-
break;
482-
}
483-
}
484-
if (tmp != CL_COMPLETE) {
485-
mGPUStuck = 1;
486-
GPUErrorReturn("GPU Stuck, future processing in this component is disabled, skipping event (GPU Event State %d)", (int32_t)tmp);
487-
}
488-
} else {
489-
clFinish(mInternals->command_queue[stream]);
490-
}
491-
return 0;
492-
}
493-
494473
void GPUReconstructionOCL::SynchronizeGPU()
495474
{
496475
for (int32_t i = 0; i < mNStreams; i++) {

GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,6 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface
4343
virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override;
4444

4545
void SynchronizeGPU() override;
46-
int32_t DoStuckProtection(int32_t stream, deviceEvent event) override;
4746
int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false) override;
4847
void SynchronizeStream(int32_t stream) override;
4948
void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) override;

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -301,7 +301,6 @@ BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Proces
301301
AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)")
302302
AddOption(gpuDeviceOnly, bool, false, "", 0, "Use only GPU as device (i.e. no CPU for OpenCL)")
303303
AddOption(globalInitMutex, bool, false, "", 0, "Use global mutex to synchronize initialization of multiple GPU instances")
304-
AddOption(stuckProtection, int32_t, 0, "", 0, "Timeout in us, When AMD GPU is stuck, just continue processing and skip tracking, do not crash or stall the chain")
305304
AddOption(trdNCandidates, int32_t, 3, "", 0, "Number of branching track candidates for single input track during propagation")
306305
AddOption(trdTrackModelO2, bool, false, "", 0, "Use O2 track model instead of GPU track model for TRD tracking")
307306
AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, -1 = autoselect (-2 for O2, 0 for standalone))")
@@ -383,7 +382,6 @@ AddOption(debugOnFailureMaxN, uint32_t, 1, "", 0, "Max number of times to run th
383382
AddOption(debugOnFailureMaxFiles, uint32_t, 0, "", 0, "Max number of files to have in the target folder")
384383
AddOption(debugOnFailureMaxSize, uint32_t, 0, "", 0, "Max size of existing dumps in the target folder in GB")
385384
AddOption(debugOnFailureDirectory, std::string, ".", "", 0, "Target folder for debug / dump")
386-
AddOption(amdMI100SerializationWorkaround, bool, false, "", 0, "Enable workaround that mitigates MI100 serialization bug")
387385
AddOption(memoryStat, bool, false, "", 0, "Print memory statistics")
388386
AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr)
389387
AddSubConfig(GPUSettingsProcessingRTC, rtc)

GPU/GPUTracking/Global/GPUChain.h

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -224,8 +224,6 @@ class GPUChain
224224

225225
inline GPUChain* GetNextChainInQueue() { return mRec->GetNextChainInQueue(); }
226226

227-
virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; }
228-
229227
template <class T, class S, typename... Args>
230228
bool DoDebugAndDump(RecoStep step, uint32_t mask, T& processor, S T::*func, Args&&... args)
231229
{

GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -149,9 +149,6 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
149149
GPUTPCTracker& trk = processors()->tpcTrackers[iSector];
150150
GPUTPCTracker& trkShadow = doGPU ? processorsShadow()->tpcTrackers[iSector] : trk;
151151
int32_t useStream = StreamForSector(iSector);
152-
if (GetProcessingSettings().amdMI100SerializationWorkaround) {
153-
SynchronizeStream(useStream); // TODO: Remove this workaround once fixed on MI100
154-
}
155152

156153
if (GetProcessingSettings().debugLevel >= 3) {
157154
GPUInfo("Creating Sector Data (Sector %d)", iSector);

prodtests/full-system-test/dpl-workflow.sh

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -284,8 +284,6 @@ if [[ $GPUTYPE == "HIP" ]]; then
284284
if [[ ${EPN_NODE_MI100:-} == "1" && ${DISABLE_MI100_SERIALIZATION:-0} != 1 ]]; then
285285
if [[ -n ${OPTIMIZED_PARALLEL_ASYNC:-} ]] || [[ $EPNSYNCMODE == 1 && ${FULL_MI100_SERIALIZATION:-0} == 1 ]]; then
286286
GPU_CONFIG_KEY+="GPU_proc.serializeGPU=3;"
287-
elif [[ $EPNSYNCMODE == 1 ]]; then
288-
GPU_CONFIG_KEY+="GPU_proc.amdMI100SerializationWorkaround=1;"
289287
fi
290288
fi
291289
#export HSA_TOOLS_LIB=/opt/rocm/lib/librocm-debug-agent.so.2

0 commit comments

Comments
 (0)