Adjusting for review comments

ChSonnabend · ChSonnabend · commit cc6c05c81c12 · 2025-03-12T11:47:20.000+01:00
diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h
@@ -275,7 +275,7 @@ union float32_bits {
 }; // namespace detail
 
 template <class Derived>
-GPUd() inline constexpr uint16_t Float16Impl<Derived>::ToUint16Impl(float v) noexcept
+GPUdi() constexpr uint16_t Float16Impl<Derived>::ToUint16Impl(float v) noexcept
 {
   detail::float32_bits f{};
   f.f = v;
@@ -324,7 +324,7 @@ GPUd() inline constexpr uint16_t Float16Impl<Derived>::ToUint16Impl(float v) noe
 }
 
 template <class Derived>
-GPUd() inline float Float16Impl<Derived>::ToFloatImpl() const noexcept
+GPUdi() float Float16Impl<Derived>::ToFloatImpl() const noexcept
 {
   constexpr detail::float32_bits magic = {113 << 23};
   constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
@@ -528,7 +528,7 @@ struct BFloat16Impl {
 };
 
 template <class Derived>
-GPUd() inline uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept
+GPUdi() uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept
 {
   uint16_t result;
   if (o2::gpu::CAMath::IsNaN(v)) {
@@ -537,7 +537,7 @@ GPUd() inline uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept
     auto get_msb_half = [](float fl) {
       uint16_t result;
 #ifdef GPUCA_GPUCODE
-      result = 0;
+      o2::gpu::CAMath::memcpy(&result, reinterpret_cast<char*>(&fl) + sizeof(uint16_t), sizeof(uint16_t));
 #else
 #ifdef __cpp_if_constexpr
       if constexpr (detail::endian::native == detail::endian::little)
@@ -547,6 +547,7 @@ GPUd() inline uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept
       {
         std::memcpy(&result, reinterpret_cast<char*>(&fl) + sizeof(uint16_t), sizeof(uint16_t));
       } else {
+        static_assert(false, "ERROR!!!");
         std::memcpy(&result, &fl, sizeof(uint16_t));
       }
       return result;
@@ -566,17 +567,18 @@ GPUd() inline uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept
     }
 
     template <class Derived>
-    GPUd() inline float BFloat16Impl<Derived>::ToFloatImpl() const noexcept
+    GPUdi() float BFloat16Impl<Derived>::ToFloatImpl() const noexcept
     {
       if (IsNaN()) {
-        return std::numeric_limits<float>::quiet_NaN();
+        return o2::gpu::CAMath::QuietNaN();
       }
       float result;
+      char* const first = reinterpret_cast<char*>(&result);
+      char* const second = first + sizeof(uint16_t);
 #ifdef GPUCA_GPUCODE
-      result = 0; // Fixme: implement memcpy
+      first[0] = first[1] = 0;
+      o2::gpu::CAMath::memcpy(second, &val, sizeof(uint16_t));
 #else
-  char* const first = reinterpret_cast<char*>(&result);
-  char* const second = first + sizeof(uint16_t);
 #ifdef __cpp_if_constexpr
   if constexpr (detail::endian::native == detail::endian::little)
 #else
@@ -726,7 +728,7 @@ GPUd() inline uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept
       /// <summary>
       /// User defined conversion operator. Converts Float16_t to float.
       /// </summary>
-      explicit operator float() const noexcept { return ToFloat(); }
+      GPUdi() explicit operator float() const noexcept { return ToFloat(); }
 
       using Base::operator==;
       using Base::operator!=;
@@ -867,7 +869,7 @@ GPUd() inline uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept
       /// <summary>
       /// User defined conversion operator. Converts BFloat16_t to float.
       /// </summary>
-      explicit operator float() const noexcept { return ToFloat(); }
+      GPUdi() explicit operator float() const noexcept { return ToFloat(); }
 
       // We do not have an inherited impl for the below operators
       // as the internal class implements them a little differently
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -899,6 +899,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
           }
 
+          int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
+          clustererNN.nnClusterizerDtype = evalDtype;
+
           // Settings for the NN evaluation
           clustererNN.nnClassThreshold = nn_settings.nnClassThreshold;
           clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold;
@@ -920,7 +923,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           }
 
           float time_clusterizer = 0, time_fill = 0;
-          int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
 
           for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) {
             uint batchStart = batch * clustererNN.nnClusterizerBatchedMode;

Original file line number	Diff line number	Diff line change
`@@ -899,6 +899,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)`
`899`	`899`	`clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;`
`900`	`900`	`}`
`901`	`901`
	`902`	`+ int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;`
	`903`	`+ clustererNN.nnClusterizerDtype = evalDtype;`
	`904`	`+`
`902`	`905`	`// Settings for the NN evaluation`
`903`	`906`	`clustererNN.nnClassThreshold = nn_settings.nnClassThreshold;`
`904`	`907`	`clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold;`
`@@ -920,7 +923,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)`
`920`	`923`	`}`
`921`	`924`
`922`	`925`	`float time_clusterizer = 0, time_fill = 0;`
`923`		`- int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;`
`924`	`926`
`925`	`927`	`for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) {`
`926`	`928`	`uint batchStart = batch * clustererNN.nnClusterizerBatchedMode;`