Skip to content

Commit 6e2f625

Browse files
Write standalone --debug output to CSV file
1 parent 3d17e85 commit 6e2f625

File tree

5 files changed

+140
-19
lines changed

5 files changed

+140
-19
lines changed

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535

3636
#include <atomic>
3737
#include <ctime>
38+
#include <string>
3839

3940
#ifndef _WIN32
4041
#include <unistd.h>
@@ -230,6 +231,7 @@ int32_t GPUReconstructionCPU::RunChains()
230231
GPUInfo("Allocated memory when starting processing %34s", "");
231232
PrintMemoryOverview();
232233
}
234+
233235
mTimerTotal.Start();
234236
const std::clock_t cpuTimerStart = std::clock();
235237
int32_t retVal = 0;
@@ -264,7 +266,10 @@ int32_t GPUReconstructionCPU::RunChains()
264266
double kernelTotal = 0;
265267
std::vector<double> kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.);
266268

269+
debugWriter writer(GetProcessingSettings().debugCSV, GetProcessingSettings().debugMarkdown, mStatNEvents);
270+
267271
if (GetProcessingSettings().debugLevel >= 1) {
272+
writer.header();
268273
for (uint32_t i = 0; i < mTimers.size(); i++) {
269274
double time = 0;
270275
if (mTimers[i] == nullptr) {
@@ -284,11 +289,7 @@ int32_t GPUReconstructionCPU::RunChains()
284289
int32_t stepNum = getRecoStepNum(mTimers[i]->step);
285290
kernelStepTimes[stepNum] += time;
286291
}
287-
char bandwidth[256] = "";
288-
if (mTimers[i]->memSize && mStatNEvents && time != 0.) {
289-
snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count);
290-
}
291-
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth);
292+
writer.row('K', mTimers[i]->count, mTimers[i]->name.c_str(), time, -1.0, -1.0, mTimers[i]->memSize);
292293
if (GetProcessingSettings().resetTimers) {
293294
mTimers[i]->count = 0;
294295
mTimers[i]->memSize = 0;
@@ -298,16 +299,13 @@ int32_t GPUReconstructionCPU::RunChains()
298299
if (GetProcessingSettings().recoTaskTiming) {
299300
for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) {
300301
if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) {
301-
printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks",
302-
gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime());
302+
writer.row(' ', 0, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)", kernelStepTimes[i], mTimersRecoSteps[i].timerCPU, mTimersRecoSteps[i].timerTotal.GetElapsedTime(), 0);
303303
}
304304
if (mTimersRecoSteps[i].bytesToGPU) {
305-
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents,
306-
mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU);
305+
writer.row('D', mTimersRecoSteps[i].countToGPU, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)", mTimersRecoSteps[i].timerToGPU.GetElapsedTime(), -1.0, -1.0, mTimersRecoSteps[i].bytesToGPU);
307306
}
308307
if (mTimersRecoSteps[i].bytesToHost) {
309-
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents,
310-
mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost);
308+
writer.row('D', mTimersRecoSteps[i].countToHost, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)", mTimersRecoSteps[i].timerToHost.GetElapsedTime(), -1.0, -1.0, mTimersRecoSteps[i].bytesToHost);
311309
}
312310
if (GetProcessingSettings().resetTimers) {
313311
mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0;
@@ -321,14 +319,11 @@ int32_t GPUReconstructionCPU::RunChains()
321319
}
322320
for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) {
323321
if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) {
324-
printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents);
322+
writer.row(' ', 0, gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime(), -1.0, -1.0, 0);
325323
}
326324
}
327-
if (GetProcessingSettings().debugLevel >= 1) {
328-
mStatKernelTime = kernelTotal * 1000000 / mStatNEvents;
329-
printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str());
330-
}
331-
printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str());
325+
double gpu_time = GetProcessingSettings().debugLevel >= 1 ? kernelTotal : -1.0;
326+
writer.row(' ', 0, "Wall", gpu_time, mStatCPUTime, mTimerTotal.GetElapsedTime(), 0, nEventReport);
332327
} else if (GetProcessingSettings().debugLevel >= 0) {
333328
GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());
334329
}

GPU/GPUTracking/Base/GPUReconstructionCPU.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
#define GPURECONSTRUCTIONICPU_H
1717

1818
#include "GPUReconstructionProcessing.h"
19+
#include <fstream>
1920
#include <stdexcept>
21+
#include <string>
2022
#include <vector>
2123

2224
namespace Ort
@@ -100,6 +102,17 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface
100102
size_t TransferMemoryResourcesHelper(GPUProcessor* proc, int32_t stream, bool all, bool toGPU);
101103
template <class S, int32_t I = 0, typename... Args>
102104
void runKernelInterface(krnlSetup&& setup, Args const&... args);
105+
106+
struct debugWriter {
107+
debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents);
108+
void header();
109+
void row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport = "");
110+
111+
private:
112+
std::ofstream streamCSV;
113+
bool mMarkdown;
114+
uint32_t mStatNEvents;
115+
};
103116
};
104117

105118
} // namespace o2::gpu

GPU/GPUTracking/Base/GPUReconstructionDebug.cxx

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
/// \author David Rohr
1414

1515
#include "GPUReconstruction.h"
16+
#include "GPUReconstructionCPU.h"
1617
#include "GPULogging.h"
1718
#include "GPUSettings.h"
1819

@@ -23,6 +24,8 @@
2324
#include <filesystem>
2425
#include <chrono>
2526
#include <format>
27+
#include <iostream>
28+
#include <string>
2629

2730
using namespace o2::gpu;
2831

@@ -186,3 +189,107 @@ bool GPUReconstruction::triggerDebugDump()
186189
}
187190
return false;
188191
}
192+
193+
GPUReconstructionCPU::debugWriter::debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents) : mMarkdown{markdown}, mStatNEvents{statNEvents}
194+
{
195+
if (!filenameCSV.empty()) {
196+
streamCSV.open(filenameCSV, std::ios::out | std::ios::app);
197+
}
198+
}
199+
200+
void GPUReconstructionCPU::debugWriter::header()
201+
{
202+
if (streamCSV.is_open() && !streamCSV.tellp()) {
203+
streamCSV << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n";
204+
}
205+
206+
if (mMarkdown) {
207+
std::cout << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n";
208+
std::cout << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n";
209+
}
210+
}
211+
212+
void GPUReconstructionCPU::debugWriter::row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport)
213+
{
214+
double scale = 1000000.0 / mStatNEvents;
215+
216+
if (streamCSV.is_open()) {
217+
streamCSV << type << ",";
218+
if (count != 0)
219+
streamCSV << count;
220+
streamCSV << "," << name << ",";
221+
if (gpu_time != -1.0)
222+
streamCSV << std::format("{:.0f}", gpu_time * scale);
223+
streamCSV << ",";
224+
if (cpu_time != -1.0)
225+
streamCSV << std::format("{:.0f}", cpu_time * scale);
226+
streamCSV << ",";
227+
if (cpu_time != -1.0 && total_time != -1.0)
228+
streamCSV << std::format("{:.2f}", cpu_time / total_time);
229+
streamCSV << ",";
230+
if (total_time != -1.0)
231+
streamCSV << std::format("{:.0f}", total_time * scale);
232+
streamCSV << ",";
233+
if (memSize != 0 && count != 0)
234+
streamCSV << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count);
235+
else
236+
streamCSV << ",,";
237+
streamCSV << std::endl;
238+
}
239+
240+
if (mMarkdown) {
241+
std::cout << "| " << type << " | ";
242+
if (count != 0)
243+
std::cout << std::format("{:6} |", count);
244+
else
245+
std::cout << " |";
246+
std::cout << std::format(" {:42}|", name);
247+
if (gpu_time != -1.0)
248+
std::cout << std::format("{:10.0f} |", gpu_time * scale);
249+
else
250+
std::cout << " |";
251+
if (cpu_time != -1.0)
252+
std::cout << std::format("{:10.0f} |", cpu_time * scale);
253+
else
254+
std::cout << " |";
255+
if (cpu_time != -1.0 && total_time != -1.0)
256+
std::cout << std::format("{:8.2f} |", cpu_time / total_time);
257+
else
258+
std::cout << " |";
259+
if (total_time != -1.0)
260+
std::cout << std::format("{:10.0f} |", total_time * scale);
261+
else
262+
std::cout << " |";
263+
if (memSize != 0 && count != 0)
264+
std::cout << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count);
265+
else
266+
std::cout << " | | |";
267+
std::cout << std::endl;
268+
} else {
269+
if (name.substr(0, 3) == "GPU") {
270+
char bandwidth[256] = "";
271+
if (memSize && mStatNEvents && gpu_time != 0.0) {
272+
snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count);
273+
}
274+
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type, count, name.c_str(), gpu_time * scale, bandwidth);
275+
} else if (name.substr(0, 3) == "TPC") {
276+
std::size_t n = name.find('(');
277+
std::string basename = name.substr(0, n - 1);
278+
std::string postfix = name.substr(n + 1, name.size() - n - 2);
279+
if (total_time != -1.0) {
280+
printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", postfix.c_str(),
281+
basename.c_str(), gpu_time * scale, "", total_time * scale, cpu_time * scale, cpu_time / total_time);
282+
} else {
283+
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", count, postfix.c_str(), basename.c_str(), gpu_time * scale,
284+
memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count);
285+
}
286+
} else if (name == "Prepare") {
287+
printf("Execution Time: General Step : %50s Time: %'10.0f us\n", name.c_str(), gpu_time * scale);
288+
} else if (name == "Wall") {
289+
if (gpu_time != -1.0) {
290+
printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", gpu_time * scale, nEventReport.c_str());
291+
}
292+
printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", total_time * scale, cpu_time * scale, cpu_time / total_time, nEventReport.c_str());
293+
}
294+
}
295+
}

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,8 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent,
329329
AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)")
330330
AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file")
331331
AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6")
332+
AddOption(debugCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.")
333+
AddOption(debugMarkdown, bool, false, "", 0, "Print the results of standlaone benchmarks in markdown format")
332334
AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
333335
AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
334336
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1))

GPU/GPUTracking/Standalone/Benchmark/standalone.cxx

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,9 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
627627
if (configStandalone.runs > 1) {
628628
printf("Run %d (thread %d)\n", iteration + 1, threadId);
629629
}
630-
recUse->SetResetTimers(iRun < configStandalone.runsInit);
630+
if (configStandalone.runsInit > 0 && configStandalone.proc.debugCSV.empty()) {
631+
recUse->SetResetTimers(iRun < configStandalone.runsInit);
632+
}
631633
if (configStandalone.outputcontrolmem) {
632634
recUse->SetOutputControl(threadId ? outputmemoryPipeline.get() : outputmemory.get(), configStandalone.outputcontrolmem);
633635
}
@@ -685,7 +687,9 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
685687
chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0;
686688
}
687689
chainTrackingAsync->mIOPtrs.clustersNative = nullptr;
688-
recAsync->SetResetTimers(iRun < configStandalone.runsInit);
690+
if (configStandalone.runsInit > 0 && configStandalone.proc.debugCSV.empty()) {
691+
recAsync->SetResetTimers(iRun < configStandalone.runsInit);
692+
}
689693
tmpRetVal = recAsync->RunChains();
690694
if (tmpRetVal == 0 || tmpRetVal == 2) {
691695
OutputStat(chainTrackingAsync, nullptr, nullptr);

0 commit comments

Comments
 (0)