AliceO2Group
diff --git a/‎Common/ML/include/ML/3rdparty/GPUORTFloat16.h‎
Lines changed: 10 additions & 2 deletions b/‎Common/ML/include/ML/3rdparty/GPUORTFloat16.h‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎GPU/GPUTracking/CMakeLists.txt‎
Lines changed: 4 additions & 9 deletions b/‎GPU/GPUTracking/CMakeLists.txt‎
Lines changed: 4 additions & 9 deletions
diff --git a/‎GPU/GPUTracking/Definitions/GPUDefGPUParameters.h‎
Lines changed: 0 additions & 3 deletions b/‎GPU/GPUTracking/Definitions/GPUDefGPUParameters.h‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx‎
Lines changed: 6 additions & 4 deletions b/‎GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx‎
Lines changed: 6 additions & 4 deletions
@@ -16,6 +16,7 @@
 #endif
 
 #include "GPUCommonDef.h"
+#include "GPUCommonMath.h"
 
 namespace o2
 {
@@ -530,11 +531,14 @@ template <class Derived>
 GPUd() inline uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept
 {
   uint16_t result;
-  if (std::isnan(v)) {
+  if (o2::gpu::CAMath::IsNaN(v)) {
     result = kPositiveQNaNBits;
   } else {
     auto get_msb_half = [](float fl) {
       uint16_t result;
+#ifdef GPUCA_GPUCODE
+      result = 0;
+#else
 #ifdef __cpp_if_constexpr
       if constexpr (detail::endian::native == detail::endian::little)
 #else
@@ -557,6 +561,7 @@ GPUd() inline uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept
     U32 += (upper_bits & 1) + kRoundToNearest;
     result = get_msb_half(F32);
   }
+#endif
   return result;
 }
 
@@ -567,6 +572,9 @@ GPUd() inline float BFloat16Impl<Derived>::ToFloatImpl() const noexcept
     return std::numeric_limits<float>::quiet_NaN();
   }
   float result;
+#ifdef GPUCA_GPUCODE
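+  result = 0; // FIXME: placeholder for device code; the uint16_t -> float byte copy done with std::memcpy/std::memset in the host branch below is not implemented here yet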
+#else
   char* const first = reinterpret_cast<char*>(&result);
   char* const second = first + sizeof(uint16_t);
 #ifdef __cpp_if_constexpr
@@ -581,6 +589,7 @@ GPUd() inline float BFloat16Impl<Derived>::ToFloatImpl() const noexcept
     std::memcpy(first, &val, sizeof(uint16_t));
     std::memset(second, 0, sizeof(uint16_t));
   }
+#endif
   return result;
 }
 
@@ -872,5 +881,4 @@ static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match");
 } // namespace OrtDataType
 
 } // namespace o2
-
 #endif
@@ -276,15 +276,11 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
                                         O2::GPUCommon
                                         O2::ReconstructionDataFormats
                                         O2::TPCFastTransformation
+                                        O2::ML
                   PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC
                   SOURCES ${SRCS_DATATYPES})
-  if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone")
-    add_compile_definitions(GPUCA_HAS_ONNX=1)
-    target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX)
-    target_link_libraries(${targetName} PUBLIC O2::ML)
-  else()
-    target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2)
-  endif()
+  add_compile_definitions(GPUCA_HAS_ONNX=1)
+  target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX)
 
   o2_target_root_dictionary(GPUDataTypes
                             HEADERS ${HDRS_CINT_DATATYPES} ${HDRS_CINT_O2_ADDITIONAL}
@@ -350,7 +346,6 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
                          LABELS its COMPILE_ONLY)
 
   add_subdirectory(Interface)
-
 endif()
 
 # Main CMake part for Standalone
@@ -422,4 +417,4 @@ endif()
 
 if(${GPUCA_NO_FAST_MATH})
   target_compile_definitions(${targetName} PUBLIC GPUCA_NO_FAST_MATH)
-endif()
+endif()
@@ -81,7 +81,6 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 448
-  #define GPUCA_LB_GPUTPCNNClusterizerKernels 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -148,7 +147,6 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 512
-  #define GPUCA_LB_GPUTPCNNClusterizerKernels 512
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -215,7 +213,6 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 448
   #define GPUCA_LB_GPUTPCCFDeconvolution 384
   #define GPUCA_LB_GPUTPCCFClusterizer 448
-  #define GPUCA_LB_GPUTPCNNClusterizerKernels 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
 
@@ -858,7 +858,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
         uint32_t iSector = iSectorBase + lane;
         GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
+#ifdef GPUCA_HAS_ONNX
         GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
+#endif
         GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
 
         if (doGPU) {
@@ -929,18 +931,18 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
             auto stop0 = std::chrono::high_resolution_clock::now();
             auto start1 = std::chrono::high_resolution_clock::now();
-            nnApplication.inferenceNetworkClass(clustererNN, iSize, evalDtype, batchStart);
+            nnApplication.inferenceNetwork(clustererNN.model_class, clustererNN, iSize, clusterer.modelProbabilities);
             if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
               runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels
             } else {
               runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels
             }
 
             if (!clustererNN.nnClusterizerUseCfRegression) {
-              nnApplication.inferenceNetworkReg1(clustererNN, iSize, evalDtype, batchStart);
+              nnApplication.inferenceNetwork(clustererNN.model_reg_1, clustererNN, iSize, clusterer.outputDataReg1);
               runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 1
               if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) {
-                nnApplication.inferenceNetworkReg2(clustererNN, iSize, evalDtype, batchStart);
+                nnApplication.inferenceNetwork(clustererNN.model_reg_2, clustererNN, iSize, clusterer.outputDataReg2);
                 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 2
               }
             }
@@ -1168,4 +1170,4 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
 #endif
   return 0;
-}
+}