Skip to content

Commit dd8a3ba

Browse files
committed
GPU: Add optional cpu-time measurement per recoStep
1 parent 1f03740 commit dd8a3ba

File tree

7 files changed

+51
-31
lines changed

7 files changed

+51
-31
lines changed

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,9 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
249249
if (mProcessingSettings.debugLevel < 1) {
250250
mProcessingSettings.deviceTimers = false;
251251
}
252+
if (mProcessingSettings.debugLevel > 0) {
253+
mProcessingSettings.recoTaskTiming = true;
254+
}
252255
if (mProcessingSettings.deterministicGPUReconstruction == -1) {
253256
mProcessingSettings.deterministicGPUReconstruction = mProcessingSettings.debugLevel >= 6;
254257
}

GPU/GPUTracking/Base/GPUReconstruction.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,7 @@ class GPUReconstruction
332332
uint32_t mNEventsProcessed = 0;
333333
double mStatKernelTime = 0.;
334334
double mStatWallTime = 0.;
335+
double mStatCPUTime = 0.;
335336
std::shared_ptr<GPUROOTDumpCore> mROOTDump;
336337
std::vector<std::array<uint32_t, 4>>* mOutputErrorCodes = nullptr;
337338

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "GPUConstantMem.h"
3434
#include "GPUMemorySizeScalers.h"
3535
#include <atomic>
36+
#include <ctime>
3637

3738
#define GPUCA_LOGGING_PRINTF
3839
#include "GPULogging.h"
@@ -220,7 +221,8 @@ int32_t GPUReconstructionCPU::RunChains()
220221
mStatNEvents++;
221222
mNEventsProcessed++;
222223

223-
timerTotal.Start();
224+
mTimerTotal.Start();
225+
const std::clock_t cpuTimerStart = std::clock();
224226
if (mProcessingSettings.doublePipeline) {
225227
int32_t retVal = EnqueuePipeline();
226228
if (retVal) {
@@ -237,17 +239,18 @@ int32_t GPUReconstructionCPU::RunChains()
237239
}
238240
}
239241
}
240-
timerTotal.Stop();
242+
mTimerTotal.Stop();
243+
mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC;
241244

242-
mStatWallTime = (timerTotal.GetElapsedTime() * 1000000. / mStatNEvents);
245+
mStatWallTime = (mTimerTotal.GetElapsedTime() * 1000000. / mStatNEvents);
243246
std::string nEventReport;
244247
if (GetProcessingSettings().debugLevel >= 0 && mStatNEvents > 1) {
245248
nEventReport += " (avergage of " + std::to_string(mStatNEvents) + " runs)";
246249
}
247-
if (GetProcessingSettings().debugLevel >= 1) {
248-
double kernelTotal = 0;
249-
std::vector<double> kernelStepTimes(GPUDataTypes::N_RECO_STEPS);
250+
double kernelTotal = 0;
251+
std::vector<double> kernelStepTimes(GPUDataTypes::N_RECO_STEPS, 0.);
250252

253+
if (GetProcessingSettings().debugLevel >= 1) {
251254
for (uint32_t i = 0; i < mTimers.size(); i++) {
252255
double time = 0;
253256
if (mTimers[i] == nullptr) {
@@ -277,9 +280,12 @@ int32_t GPUReconstructionCPU::RunChains()
277280
mTimers[i]->memSize = 0;
278281
}
279282
}
283+
}
284+
if (GetProcessingSettings().recoTaskTiming) {
280285
for (int32_t i = 0; i < GPUDataTypes::N_RECO_STEPS; i++) {
281286
if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) {
282-
printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us)\n", "Tasks", GPUDataTypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents);
287+
printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks",
288+
GPUDataTypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime());
283289
}
284290
if (mTimersRecoSteps[i].bytesToGPU) {
285291
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", GPUDataTypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents,
@@ -294,6 +300,7 @@ int32_t GPUReconstructionCPU::RunChains()
294300
mTimersRecoSteps[i].timerToGPU.Reset();
295301
mTimersRecoSteps[i].timerToHost.Reset();
296302
mTimersRecoSteps[i].timerTotal.Reset();
303+
mTimersRecoSteps[i].timerCPU = 0;
297304
mTimersRecoSteps[i].countToGPU = 0;
298305
mTimersRecoSteps[i].countToHost = 0;
299306
}
@@ -303,15 +310,18 @@ int32_t GPUReconstructionCPU::RunChains()
303310
printf("Execution Time: General Step : %50s Time: %'10.0f us\n", GPUDataTypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents);
304311
}
305312
}
306-
mStatKernelTime = kernelTotal * 1000000 / mStatNEvents;
307-
printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str());
308-
printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Wall", mStatWallTime, nEventReport.c_str());
313+
if (GetProcessingSettings().debugLevel >= 1) {
314+
mStatKernelTime = kernelTotal * 1000000 / mStatNEvents;
315+
printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str());
316+
}
317+
printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str());
309318
} else if (GetProcessingSettings().debugLevel >= 0) {
310-
GPUInfo("Total Wall Time: %lu us%s", (uint64_t)mStatWallTime, nEventReport.c_str());
319+
GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());
311320
}
312321
if (mProcessingSettings.resetTimers) {
313322
mStatNEvents = 0;
314-
timerTotal.Reset();
323+
mStatCPUTime = 0;
324+
mTimerTotal.Reset();
315325
}
316326

317327
return 0;

GPU/GPUTracking/Base/GPUReconstructionProcessing.h

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -77,14 +77,25 @@ class GPUReconstructionProcessing : public GPUReconstruction
7777
constexpr static const char* GetKernelName();
7878

7979
// Public queries for timers
80-
HighResTimer& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)].timerTotal; }
80+
auto& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)]; }
8181
HighResTimer& getGeneralStepTimer(GeneralStep step) { return mTimersGeneralSteps[getGeneralStepNum(step)]; }
8282

8383
template <class T>
8484
void AddGPUEvents(T*& events);
8585

8686
virtual std::unique_ptr<gpu_reconstruction_kernels::threadContext> GetThreadContext() override;
8787

88+
struct RecoStepTimerMeta {
89+
HighResTimer timerToGPU;
90+
HighResTimer timerToHost;
91+
HighResTimer timerTotal;
92+
double timerCPU = 0.;
93+
size_t bytesToGPU = 0;
94+
size_t bytesToHost = 0;
95+
uint32_t countToGPU = 0;
96+
uint32_t countToHost = 0;
97+
};
98+
8899
protected:
89100
GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg) : GPUReconstruction(cfg) {}
90101
using deviceEvent = gpu_reconstruction_kernels::deviceEvent;
@@ -105,21 +116,11 @@ class GPUReconstructionProcessing : public GPUReconstruction
105116
size_t memSize; // Memory size for memory bandwidth computation
106117
};
107118

108-
struct RecoStepTimerMeta {
109-
HighResTimer timerToGPU;
110-
HighResTimer timerToHost;
111-
HighResTimer timerTotal;
112-
size_t bytesToGPU = 0;
113-
size_t bytesToHost = 0;
114-
uint32_t countToGPU = 0;
115-
uint32_t countToHost = 0;
116-
};
117-
118119
HighResTimer mTimersGeneralSteps[GPUDataTypes::N_GENERAL_STEPS];
119120

120121
std::vector<std::unique_ptr<timerMeta>> mTimers;
121122
RecoStepTimerMeta mTimersRecoSteps[GPUDataTypes::N_RECO_STEPS];
122-
HighResTimer timerTotal;
123+
HighResTimer mTimerTotal;
123124
template <class T, int32_t I = 0>
124125
HighResTimer& getKernelTimer(RecoStep step, int32_t num = 0, size_t addMemorySize = 0, bool increment = true);
125126
template <class T, int32_t J = -1>

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent,
234234
AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)")
235235
AddOption(debugMask, int32_t, 262143, "", 0, "Mask for debug output dumps to file")
236236
AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
237+
AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
237238
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6")
238239
AddOption(showOutputStat, bool, false, "", 0, "Print some track output statistics")
239240
AddOption(runCompressionStatistics, bool, false, "compressionStat", 0, "Run statistics and verification for cluster compression")

GPU/GPUTracking/Global/GPUChain.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
#include "GPUReconstructionCPU.h"
1919

20+
#include <ctime>
21+
2022
namespace o2::gpu
2123
{
2224
class GPUChain
@@ -280,16 +282,20 @@ template <class T, class S, typename... Args>
280282
int32_t GPUChain::runRecoStep(RecoStep step, S T::*func, Args... args)
281283
{
282284
if (GetRecoSteps().isSet(step)) {
283-
if (GetProcessingSettings().debugLevel >= 1) {
284-
mRec->getRecoStepTimer(step).Start();
285+
auto* timer = GetProcessingSettings().recoTaskTiming ? &mRec->getRecoStepTimer(step) : nullptr;
286+
std::clock_t c;
287+
if (timer) {
288+
timer->timerTotal.Start();
289+
c = std::clock();
285290
}
286291
int32_t retVal = (reinterpret_cast<T*>(this)->*func)(args...);
287-
if (GetProcessingSettings().debugLevel >= 1) {
288-
mRec->getRecoStepTimer(step).Stop();
292+
if (timer) {
293+
timer->timerTotal.Stop();
294+
timer->timerCPU += (double)(std::clock() - c) / CLOCKS_PER_SEC;
289295
}
290296
return retVal;
291297
}
292-
return false;
298+
return 0;
293299
}
294300

295301
} // namespace o2::gpu

GPU/GPUTracking/utils/timer.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,7 @@ class HighResTimer
4040

4141
static double GetFrequency();
4242
static double GetTime();
43-
#ifndef GPUCODE
4443
static double Frequency;
45-
#endif
4644
};
4745

4846
#endif

0 commit comments

Comments
 (0)