1+ // Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2+ // See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3+ // All rights not expressly granted are reserved.
4+ //
5+ // This software is distributed under the terms of the GNU General Public
6+ // License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7+ //
8+ // In applying this license CERN does not waive the privileges and immunities
9+ // granted to it by virtue of its status as an Intergovernmental Organization
10+ // or submit itself to any jurisdiction.
///
/// \file Common.h
/// \author: mconcas@cern.ch
14+
15+ #ifndef GPU_BENCHMARK_UTILS_H
16+ #define GPU_BENCHMARK_UTILS_H
17+
18+ #if defined(__HIPCC__)
19+ #include " hip/hip_runtime.h"
20+ #endif
21+
22+ #include < iostream>
23+ #include < sstream>
24+ #include < iomanip>
25+ #include < typeinfo>
26+ #include < boost/program_options.hpp>
27+ #include < vector>
28+ #include < string>
29+ #include < cmath>
30+
// ANSI SGR escape sequences for terminal output. A sequence is only recognised
// when ESC (\x1B) is immediately followed by '[', so the literals must not
// contain a space between the two characters.
#define KNRM " \x1B[0m"      // reset all attributes
#define KRED " \x1B[31m"     // red foreground
#define KGRN " \x1B[32m"     // green foreground
#define KYEL " \x1B[33m"     // yellow foreground
#define configLU " \x1B[34m" // blue foreground
#define KMAG " \x1B[35m"     // magenta foreground
#define KCYN " \x1B[36m"     // cyan foreground
#define KWHT " \x1B[37m"     // white foreground

// 2^30, evaluated as an int expression.
#define GB (1024 * 1024 * 1024)
41+
// Report a fatal error and terminate the process.
// Prints the printf-style message passed as arguments (prefixed with KRED),
// then a "TEST FAILED" line followed by KNRM, and calls exit(EXIT_FAILURE).
// The statements are wrapped in do { ... } while (0) so the macro expands to a
// single statement and stays intact under an unbraced `if`. The trailing
// semicolon is kept so call sites written without one continue to compile.
#define failed(...)                           \
  do {                                        \
    printf(" %serror: ", KRED);               \
    printf(__VA_ARGS__);                      \
    printf(" \n ");                           \
    printf(" error: TEST FAILED\n %s", KNRM); \
    exit(EXIT_FAILURE);                       \
  } while (0);
48+ #endif
49+
/// Accepts a value of any type by const reference and ignores it; the body
/// performs no work.
template <typename T>
void discardResult(const T& /*ignored*/) {}
54+
/// Identifiers of the available tests.
enum class Test {
  Read,
  Write,
  Copy,
  RandomRead,
  RandomWrite,
  RandomCopy
};

/// Writes the textual label of `test` to `os` and returns the stream.
/// A value that matches no enumerator leaves the stream untouched.
inline std::ostream& operator<<(std::ostream& os, Test test)
{
  switch (test) {
    case Test::Read:
      return os << " read";
    case Test::Write:
      return os << " write";
    case Test::Copy:
      return os << " copy";
    case Test::RandomRead:
      return os << " random read";
    case Test::RandomWrite:
      return os << " random write";
    case Test::RandomCopy:
      return os << " random copy";
  }
  return os;
}
88+
/// Identifiers of the available execution modes.
enum class Mode {
  Sequential,
  Concurrent,
  Distributed
};

/// Writes the textual label of `mode` to `os` and returns the stream.
/// A value that matches no enumerator leaves the stream untouched.
inline std::ostream& operator<<(std::ostream& os, Mode mode)
{
  switch (mode) {
    case Mode::Sequential:
      return os << " sequential";
    case Mode::Concurrent:
      return os << " concurrent";
    case Mode::Distributed:
      return os << " distributed";
  }
  return os;
}
110+
/// Identifiers of the available kernel configurations.
enum class KernelConfig {
  Single,
  Multi,
  All,
  Manual
};

/// Writes the textual label of `config` to `os` and returns the stream.
/// A value that matches no enumerator leaves the stream untouched.
inline std::ostream& operator<<(std::ostream& os, KernelConfig config)
{
  switch (config) {
    case KernelConfig::Single:
      return os << " single";
    case KernelConfig::Multi:
      return os << " multiple";
    case KernelConfig::All:
      return os << " all";
    case KernelConfig::Manual:
      return os << " manual";
  }
  return os;
}
136+
137+ template <class T >
138+ inline std::string getType ()
139+ {
140+ if (typeid (T).name () == typeid (int8_t ).name ()) {
141+ return std::string{" int8_t" };
142+ }
143+ if (typeid (T).name () == typeid (size_t ).name ()) {
144+ return std::string{" uint64_t" };
145+ }
146+ if (typeid (T).name () == typeid (int32_t ).name ()) {
147+ return std::string{" int32_t" };
148+ }
149+ if (typeid (T).name () == typeid (int4).name ()) {
150+ return std::string{" int4" };
151+ }
152+ return std::string{" unknown" };
153+ }
154+
155+ inline std::string getTestName (Mode mode, Test test, KernelConfig blocks)
156+ {
157+ std::string tname;
158+ tname += (mode == Mode::Sequential) ? " seq_" : " conc_" ;
159+ tname += (test == Test::Read) ? " read_" : (test == Test::Write) ? " write_"
160+ : " copy_" ;
161+ tname += (blocks == KernelConfig::Single) ? " SB" : " MB" ;
162+ return tname;
163+ }
164+
165+ // Return pointer to custom offset (GB)
166+ template <class chunk_t >
167+ inline chunk_t * getCustomPtr (chunk_t * scratchPtr, float startGB)
168+ {
169+ return reinterpret_cast <chunk_t *>(reinterpret_cast <char *>(scratchPtr) + (static_cast <size_t >(GB * startGB) & 0xFFFFFFFFFFFFF000 ));
170+ }
171+
172+ inline float computeThroughput (Test test, float result, float chunkSizeGB, int32_t ntests)
173+ {
174+ // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html
175+ // Eff_bandwidth (GB/s) = (B_r + B_w) / (~1e9 * Time (s))
176+
177+ return 1e3 * chunkSizeGB * (float )ntests / result;
178+ }
179+
180+ template <class chunk_t >
181+ inline size_t getBufferCapacity (float chunkSizeGB, int32_t prime)
182+ {
183+ auto chunkCapacity = (static_cast <size_t >(GB * chunkSizeGB) & 0xFFFFFFFFFFFFF000 ) / sizeof (chunk_t );
184+ if (!prime) {
185+ return chunkCapacity;
186+ } else {
187+ return (chunkCapacity % prime == 0 ) ? (chunkCapacity - 0x1000 ) : chunkCapacity;
188+ }
189+ }
190+
/// Returns true when n is a prime number.
/// Every value below 2 (including negative values) is reported as not prime.
/// Trial division runs while i <= n / i, which is the integer form of
/// i <= sqrt(n): it avoids a floating-point square root on each iteration and
/// cannot overflow.
inline bool is_prime(const int32_t n)
{
  if (n < 2) {
    return false;
  }
  for (int32_t i = 2; i <= n / i; ++i) {
    if (n % i == 0) {
      return false;
    }
  }
  return true;
}
207+
208+ namespace o2
209+ {
210+ namespace benchmark
211+ {
212+ struct benchmarkOpts {
213+ benchmarkOpts () = default ;
214+
215+ int32_t deviceId = 0 ;
216+ std::vector<Test> tests = {Test::Read, Test::Write, Test::Copy};
217+ std::vector<Mode> modes = {Mode::Sequential, Mode::Concurrent};
218+ std::vector<KernelConfig> pools = {KernelConfig::Single, KernelConfig::Multi};
219+ std::vector<std::string> dtypes = {" int8_t" , " int32_t" , " uint64_t" };
220+ std::vector<std::pair<float , float >> testChunks;
221+ float chunkReservedGB = 1 .f;
222+ float threadPoolFraction = 1 .f;
223+ float freeMemoryFractionToAllocate = 0 .95f ;
224+ int32_t numThreads = -1 ;
225+ int32_t numBlocks = -1 ;
226+ int32_t kernelLaunches = 1 ;
227+ int32_t nTests = 1 ;
228+ bool raw = false ;
229+ int32_t streams = 8 ;
230+ int32_t prime = 0 ;
231+ std::string outFileName = " benchmark_result" ;
232+ bool dumpChunks = false ;
233+ };
234+
235+ template <class chunk_t >
236+ struct gpuState {
237+ int32_t getMaxChunks ()
238+ {
239+ return (double )scratchSize / (chunkReservedGB * GB);
240+ }
241+
242+ int32_t getNKernelLaunches () { return iterations; }
243+ int32_t getStreamsPoolSize () { return streams; }
244+
245+ // Configuration
246+ size_t nMaxThreadsPerDimension;
247+ int32_t iterations;
248+ int32_t streams;
249+
250+ float chunkReservedGB; // Size of each partition (GB)
251+
252+ // General containers and state
253+ chunk_t * scratchPtr; // Pointer to scratch buffer
254+ size_t scratchSize; // Size of scratch area (B)
255+ std::vector<chunk_t *> partAddrOnHost; // Pointers to scratch partitions on host vector
256+ std::vector<std::pair<float , float >> testChunks; // Vector of definitions for arbitrary chunks
257+
258+ // Static info
259+ size_t totalMemory;
260+ size_t nMultiprocessors;
261+ size_t nMaxThreadsPerBlock;
262+ };
263+
264+ } // namespace benchmark
265+ } // namespace o2
0 commit comments