@@ -306,17 +306,28 @@ void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
306306}
307307
template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
{
  // Allocate and zero the per-layer neighbours index tables on the device, and
  // (since the per-layer cells buffers were populated in previous steps) also
  // create and upload the device-side array of cell-buffer pointers.
  // All async work is enqueued on stream 0.
  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours");
  // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
  allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
  // NOTE(review): cudaHostRegister has no visible matching cudaHostUnregister in this view;
  // registering the same range twice fails with cudaErrorHostMemoryAlreadyRegistered — confirm
  // this function runs once per mCellsDevice lifetime.
  checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
  checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
  for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
    // Fix: the previous message reported mNCells[iLayer] * sizeof(CellSeed), but the buffer
    // allocated below is (mNCells[iLayer] + 1) ints — report the actual footprint.
    LOGP(debug, "gpu-allocation: creating neighbours index table for {} elements on layer {}, for {} MB.", mNCells[iLayer] + 1, iLayer, (mNCells[iLayer] + 1) * sizeof(int) / MB);
    allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
    checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
  }
  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}
323+
template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
{
  // Reserve and zero-initialise the neighbours lookup table for one layer on the device.
  // The table keeps nCells + 1 entries: the extra slot allows converting the exclusive
  // prefix sum into an inclusive one later on. Work is enqueued on stream 0.
  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
  const unsigned int lutElements = nCells + 1; // one element more to move exc -> inc
  const size_t lutBytes = lutElements * sizeof(int);
  LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", lutElements, layer, lutBytes / MB);
  allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), lutBytes, nullptr, getExtAllocator());
  checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, lutBytes, mGpuStreams[0].get()));
  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}
322333
@@ -400,19 +411,20 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int& layer, st
// NOTE(review): partial diff view — the signature of createNeighboursDevice is truncated in
// the hunk header, so the full definition is not visible here.
400411 START_GPU_STREAM_TIMER (mGpuStreams [0 ].get (), " reserving neighbours" );
401412 mCellsNeighbours [layer].clear ();
402413 mCellsNeighbours [layer].resize (neighbours.size ());
// New in this change: neighbour (cell, cell) pairs move to a dedicated buffer,
// byte-filled with -1 (0xFF) as a sentinel for "unset".
414+ LOGP (debug, " gpu-allocation: reserving {} neighbours (pairs), for {} MB." , neighbours.size (), neighbours.size () * sizeof (gpuPair<int , int >) / MB);
415+ allocMemAsync (reinterpret_cast <void **>(&mNeighbourPairsDevice [layer]), neighbours.size () * sizeof (gpuPair<int , int >), &(mGpuStreams [0 ]), getExtAllocator ());
416+ checkGPUError (cudaMemsetAsync (mNeighbourPairsDevice [layer], -1 , neighbours.size () * sizeof (gpuPair<int , int >), mGpuStreams [0 ].get ()));
// NOTE(review): this surviving log line still reports sizeof(gpuPair<int,int>) although the
// buffer allocated just below now holds plain ints — the MB estimate is ~2x too large; confirm
// and correct in a follow-up.
403417 LOGP (debug, " gpu-allocation: reserving {} neighbours, for {} MB." , neighbours.size (), neighbours.size () * sizeof (gpuPair<int , int >) / MB);
404- allocMemAsync (reinterpret_cast <void **>(&mNeighboursDevice [layer]), neighbours.size () * sizeof (gpuPair<int , int >), &(mGpuStreams [0 ]), getExtAllocator ());
405- checkGPUError (cudaMemsetAsync (mNeighboursDevice [layer], -1 , neighbours.size () * sizeof (gpuPair<int , int >), mGpuStreams [0 ].get ()));
// mNeighboursDevice[layer] is now a flat int buffer (one index per neighbour) instead of pairs;
// it is left uninitialised here — presumably fully overwritten by a later kernel, TODO confirm.
418+ allocMemAsync (reinterpret_cast <void **>(&mNeighboursDevice [layer]), neighbours.size () * sizeof (int ), &(mGpuStreams [0 ]), getExtAllocator ());
406419 STOP_GPU_STREAM_TIMER (mGpuStreams [0 ].get ());
407420}
408421
template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
{
  // Build the device-side array of per-layer neighbour-buffer pointers: allocate
  // room for (nLayers - 2) int* entries and upload the host-side pointer table
  // (mNeighboursDevice) to it, all asynchronously on stream 0.
  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
  const size_t tableBytes = (nLayers - 2) * sizeof(int*);
  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), tableBytes, &(mGpuStreams[0]), getExtAllocator());
  checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), tableBytes, cudaMemcpyHostToDevice, mGpuStreams[0].get()));
  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}
418430
@@ -459,7 +471,7 @@ void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vecto
// NOTE(review): partial diff view — the signature is truncated in the hunk header and the
// function start is outside this hunk.
459471 START_GPU_STREAM_TIMER (mGpuStreams [0 ].get (), fmt::format (" downloading neighbours from layer {}" , layer));
460472 LOGP (debug, " gpu-transfer: downloading {} neighbours, for {} MB." , neighbours[layer].size (), neighbours[layer].size () * sizeof (std::pair<int , int >) / MB);
461473 // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
// The download source switches to mNeighbourPairsDevice, consistent with the pair/index
// buffer split introduced in createNeighboursDevice.
462- checkGPUError (cudaMemcpyAsync (neighbours[layer].data (), mNeighboursDevice [layer], neighbours[layer].size () * sizeof (gpuPair<int , int >), cudaMemcpyDeviceToHost, mGpuStreams [0 ].get ()));
474+ checkGPUError (cudaMemcpyAsync (neighbours[layer].data (), mNeighbourPairsDevice [layer], neighbours[layer].size () * sizeof (gpuPair<int , int >), cudaMemcpyDeviceToHost, mGpuStreams [0 ].get ()));
// NOTE(review): async D2H copy with no sync visible in this hunk — the host vector must stay
// alive and unread until stream 0 is synchronized; presumably done by the caller, verify.
463475}
464476
465477template <int nLayers>
0 commit comments