Skip to content

Commit 98c6c83

Browse files
committed
Recover Utils.h
1 parent 3cbb2f9 commit 98c6c83

File tree

1 file changed

+265
-0
lines changed

1 file changed

+265
-0
lines changed

GPU/GPUbenchmark/Shared/Utils.h

Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2+
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3+
// All rights not expressly granted are reserved.
4+
//
5+
// This software is distributed under the terms of the GNU General Public
6+
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7+
//
8+
// In applying this license CERN does not waive the privileges and immunities
9+
// granted to it by virtue of its status as an Intergovernmental Organization
10+
// or submit itself to any jurisdiction.
11+
///
12+
/// \file Utils.h
13+
/// \author: mconcas@cern.ch
14+
15+
#ifndef GPU_BENCHMARK_UTILS_H
16+
#define GPU_BENCHMARK_UTILS_H
17+
18+
#if defined(__HIPCC__)
19+
#include "hip/hip_runtime.h"
20+
#endif
21+
22+
#include <iostream>
23+
#include <sstream>
24+
#include <iomanip>
25+
#include <typeinfo>
26+
#include <boost/program_options.hpp>
27+
#include <vector>
28+
#include <string>
29+
#include <cmath>
30+
31+
// ANSI escape sequences used to colour terminal output.
// KNRM resets the attributes; the others select a foreground colour.
#define KNRM "\x1B[0m"
#define KRED "\x1B[31m"
#define KGRN "\x1B[32m"
#define KYEL "\x1B[33m"
// NOTE(review): this is the blue sequence (34); the name does not follow the K* pattern
// of its siblings but is kept unchanged because other files may reference it.
#define configLU "\x1B[34m"
#define KMAG "\x1B[35m"
#define KCYN "\x1B[36m"
#define KWHT "\x1B[37m"

// Number of bytes in one gibibyte (2^30).
// The expression has type int: multiplying it by an integer >= 2 overflows,
// so combine it with a floating point or 64-bit operand.
#define GB (1024 * 1024 * 1024)

// Print a printf-style error message in red, followed by "error: TEST FAILED",
// reset the terminal colour and terminate the process with EXIT_FAILURE.
// The body is wrapped in do { } while (0) so that the macro expands to exactly one
// statement and can be used safely in un-braced if/else branches.
// Usage: failed("format", args...);   (the trailing semicolon is required)
// NOTE(review): the #endif that follows this macro closes the include guard,
// leaving the rest of the header unguarded — worth confirming whether that is intended.
#define failed(...)                         \
  do {                                      \
    printf("%serror: ", KRED);              \
    printf(__VA_ARGS__);                    \
    printf("\n");                           \
    printf("error: TEST FAILED\n%s", KNRM); \
    exit(EXIT_FAILURE);                     \
  } while (0)
48+
#endif
49+
50+
/// Accepts any value by const reference and does nothing with it.
/// The function body is intentionally empty.
template <typename T>
void discardResult(const T& /*ignored*/)
{
}
54+
55+
/// Kinds of benchmark test that can be selected.
enum class Test {
  Read,
  Write,
  Copy,
  RandomRead,
  RandomWrite,
  RandomCopy
};

/// Writes the lower-case, human readable name of a Test to the stream.
/// A value that matches no enumerator writes nothing.
/// \return the stream passed in, to allow chaining
inline std::ostream& operator<<(std::ostream& os, Test test)
{
  const char* label = nullptr;
  switch (test) {
    case Test::Read:
      label = "read";
      break;
    case Test::Write:
      label = "write";
      break;
    case Test::Copy:
      label = "copy";
      break;
    case Test::RandomRead:
      label = "random read";
      break;
    case Test::RandomWrite:
      label = "random write";
      break;
    case Test::RandomCopy:
      label = "random copy";
      break;
  }
  if (label != nullptr) {
    os << label;
  }
  return os;
}
88+
89+
/// Execution modes that can be selected for a benchmark run.
enum class Mode {
  Sequential,
  Concurrent,
  Distributed
};

/// Writes the lower-case name of a Mode to the stream.
/// A value that matches no enumerator writes nothing.
/// \return the stream passed in, to allow chaining
inline std::ostream& operator<<(std::ostream& os, Mode mode)
{
  const char* label = nullptr;
  switch (mode) {
    case Mode::Sequential:
      label = "sequential";
      break;
    case Mode::Concurrent:
      label = "concurrent";
      break;
    case Mode::Distributed:
      label = "distributed";
      break;
  }
  if (label != nullptr) {
    os << label;
  }
  return os;
}
110+
111+
/// Kernel launch configurations that can be selected.
enum class KernelConfig {
  Single,
  Multi,
  All,
  Manual
};

/// Writes the lower-case name of a KernelConfig to the stream
/// (note that Multi is printed as "multiple").
/// A value that matches no enumerator writes nothing.
/// \return the stream passed in, to allow chaining
inline std::ostream& operator<<(std::ostream& os, KernelConfig config)
{
  const char* label = nullptr;
  switch (config) {
    case KernelConfig::Single:
      label = "single";
      break;
    case KernelConfig::Multi:
      label = "multiple";
      break;
    case KernelConfig::All:
      label = "all";
      break;
    case KernelConfig::Manual:
      label = "manual";
      break;
  }
  if (label != nullptr) {
    os << label;
  }
  return os;
}
136+
137+
template <class T>
138+
inline std::string getType()
139+
{
140+
if (typeid(T).name() == typeid(int8_t).name()) {
141+
return std::string{"int8_t"};
142+
}
143+
if (typeid(T).name() == typeid(size_t).name()) {
144+
return std::string{"uint64_t"};
145+
}
146+
if (typeid(T).name() == typeid(int32_t).name()) {
147+
return std::string{"int32_t"};
148+
}
149+
if (typeid(T).name() == typeid(int4).name()) {
150+
return std::string{"int4"};
151+
}
152+
return std::string{"unknown"};
153+
}
154+
155+
inline std::string getTestName(Mode mode, Test test, KernelConfig blocks)
156+
{
157+
std::string tname;
158+
tname += (mode == Mode::Sequential) ? "seq_" : "conc_";
159+
tname += (test == Test::Read) ? "read_" : (test == Test::Write) ? "write_"
160+
: "copy_";
161+
tname += (blocks == KernelConfig::Single) ? "SB" : "MB";
162+
return tname;
163+
}
164+
165+
// Return pointer to custom offset (GB)
166+
template <class chunk_t>
167+
inline chunk_t* getCustomPtr(chunk_t* scratchPtr, float startGB)
168+
{
169+
return reinterpret_cast<chunk_t*>(reinterpret_cast<char*>(scratchPtr) + (static_cast<size_t>(GB * startGB) & 0xFFFFFFFFFFFFF000));
170+
}
171+
172+
/// Computes 1e3 * chunkSizeGB * ntests / result.
///
/// Reference: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html
///   Eff_bandwidth (GB/s) = (B_r + B_w) / (~1e9 * Time (s))
/// \param test        not used by the computation
/// \param result      measured time; NOTE(review): the 1e3 factor suggests milliseconds — confirm
/// \param chunkSizeGB size of the processed chunk, in GB
/// \param ntests      number of repetitions accounted for in result
/// \return the throughput; the arithmetic is carried out in double precision
inline float computeThroughput(Test test, float result, float chunkSizeGB, int32_t ntests)
{
  const double scaledVolume = 1e3 * chunkSizeGB * static_cast<float>(ntests);
  return static_cast<float>(scaledVolume / result);
}
179+
180+
template <class chunk_t>
181+
inline size_t getBufferCapacity(float chunkSizeGB, int32_t prime)
182+
{
183+
auto chunkCapacity = (static_cast<size_t>(GB * chunkSizeGB) & 0xFFFFFFFFFFFFF000) / sizeof(chunk_t);
184+
if (!prime) {
185+
return chunkCapacity;
186+
} else {
187+
return (chunkCapacity % prime == 0) ? (chunkCapacity - 0x1000) : chunkCapacity;
188+
}
189+
}
190+
191+
/// Tells whether n is a prime number.
///
/// Values below 2 (including all negative numbers) are not prime.
/// Uses trial division by every integer i with i * i <= n.
/// \param n number to test
/// \return true if n is prime, false otherwise
inline bool is_prime(const int32_t n)
{
  if (n < 2) {
    return false;
  }
  // 64-bit counter: i * i would overflow a 32-bit integer for n close to INT32_MAX.
  for (int64_t i = 2; i * i <= n; ++i) {
    if (n % i == 0) {
      return false;
    }
  }
  return true;
}
207+
208+
namespace o2
209+
{
210+
namespace benchmark
211+
{
212+
struct benchmarkOpts {
213+
benchmarkOpts() = default;
214+
215+
int32_t deviceId = 0;
216+
std::vector<Test> tests = {Test::Read, Test::Write, Test::Copy};
217+
std::vector<Mode> modes = {Mode::Sequential, Mode::Concurrent};
218+
std::vector<KernelConfig> pools = {KernelConfig::Single, KernelConfig::Multi};
219+
std::vector<std::string> dtypes = {"int8_t", "int32_t", "uint64_t"};
220+
std::vector<std::pair<float, float>> testChunks;
221+
float chunkReservedGB = 1.f;
222+
float threadPoolFraction = 1.f;
223+
float freeMemoryFractionToAllocate = 0.95f;
224+
int32_t numThreads = -1;
225+
int32_t numBlocks = -1;
226+
int32_t kernelLaunches = 1;
227+
int32_t nTests = 1;
228+
bool raw = false;
229+
int32_t streams = 8;
230+
int32_t prime = 0;
231+
std::string outFileName = "benchmark_result";
232+
bool dumpChunks = false;
233+
};
234+
235+
template <class chunk_t>
236+
struct gpuState {
237+
int32_t getMaxChunks()
238+
{
239+
return (double)scratchSize / (chunkReservedGB * GB);
240+
}
241+
242+
int32_t getNKernelLaunches() { return iterations; }
243+
int32_t getStreamsPoolSize() { return streams; }
244+
245+
// Configuration
246+
size_t nMaxThreadsPerDimension;
247+
int32_t iterations;
248+
int32_t streams;
249+
250+
float chunkReservedGB; // Size of each partition (GB)
251+
252+
// General containers and state
253+
chunk_t* scratchPtr; // Pointer to scratch buffer
254+
size_t scratchSize; // Size of scratch area (B)
255+
std::vector<chunk_t*> partAddrOnHost; // Pointers to scratch partitions on host vector
256+
std::vector<std::pair<float, float>> testChunks; // Vector of definitions for arbitrary chunks
257+
258+
// Static info
259+
size_t totalMemory;
260+
size_t nMultiprocessors;
261+
size_t nMaxThreadsPerBlock;
262+
};
263+
264+
} // namespace benchmark
265+
} // namespace o2

0 commit comments

Comments
 (0)