@@ -349,26 +349,20 @@ void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
349349{
350350 GPUTimer timer (mGpuStreams [0 ], " creating cells neighbours" );
351351 // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
352- allocMemAsync (reinterpret_cast <void **>(&mCellsDeviceArray ), (nLayers - 2 ) * sizeof (CellSeed*), mGpuStreams [0 ], this ->getExtAllocator ());
353- GPUChkErrS (cudaHostRegister (mCellsDevice .data (), (nLayers - 2 ) * sizeof (CellSeed*), cudaHostRegisterPortable));
354- GPUChkErrS (cudaMemcpyAsync (mCellsDeviceArray , mCellsDevice .data (), (nLayers - 2 ) * sizeof (CellSeed*), cudaMemcpyHostToDevice, mGpuStreams [0 ].get ()));
355352 for (auto iLayer{0 }; iLayer < nLayers - 2 ; ++iLayer) {
356353 GPULog (" gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {:.2f} MB." , mNCells [iLayer], iLayer, mNCells [iLayer] * sizeof (CellSeed) / constants::MB);
357- allocMemAsync (reinterpret_cast <void **>(&mNeighboursIndexTablesDevice [iLayer]), (mNCells [iLayer] + 1 ) * sizeof (int ), mGpuStreams [0 ], this ->getExtAllocator ());
358- GPUChkErrS (cudaMemsetAsync (mNeighboursIndexTablesDevice [iLayer], 0 , (mNCells [iLayer] + 1 ) * sizeof (int ), mGpuStreams [0 ].get ()));
359- if (iLayer < nLayers - 3 ) {
360- mNNeighbours [iLayer] = 0 ;
361- }
354+ allocMemAsync (reinterpret_cast <void **>(&mNeighboursIndexTablesDevice [iLayer]), (mNCells [iLayer] + 1 ) * sizeof (int ), mGpuStreams [iLayer], this ->getExtAllocator ());
355+ GPUChkErrS (cudaMemsetAsync (mNeighboursIndexTablesDevice [iLayer], 0 , (mNCells [iLayer] + 1 ) * sizeof (int ), mGpuStreams [iLayer].get ()));
362356 }
363357}
364358
/// Reserve and zero the device-side neighbours LUT of a single layer.
///
/// \param layer  Index of the layer; selects both the LUT slot and the stream
///               on which the allocation and the memset are enqueued.
/// \param nCells Number of cells the LUT is built for. The buffer is sized
///               nCells + 1 ints: one extra entry is kept so that the
///               exclusive scan can be moved to an inclusive one.
template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
{
  auto& layerStream = mGpuStreams[layer];
  GPUTimer timer(layerStream, " reserving neighboursLUT");

  const unsigned int nEntries = nCells + 1;
  const auto lutBytes = nEntries * sizeof(int);
  GPULog(" gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {:.2f} MB.", nEntries, layer, lutBytes / constants::MB);

  // Allocation and zero-fill are both ordered on the layer's own stream.
  allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), lutBytes, layerStream, this->getExtAllocator());
  GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, lutBytes, layerStream.get()));
}
373367
374368template <int nLayers>
@@ -382,8 +376,6 @@ void TimeFrameGPU<nLayers>::loadCellsDevice()
382376 GPUChkErrS (cudaMemsetAsync (mNeighboursIndexTablesDevice [iLayer], 0 , (this ->mCells [iLayer].size () + 1 ) * sizeof (int ), mGpuStreams [iLayer].get ()));
383377 GPUChkErrS (cudaMemcpyAsync (mCellsDevice [iLayer], this ->mCells [iLayer].data (), this ->mCells [iLayer].size () * sizeof (CellSeed), cudaMemcpyHostToDevice, mGpuStreams [iLayer].get ()));
384378 }
385- allocMemAsync (reinterpret_cast <void **>(&mCellsDeviceArray ), (nLayers - 2 ) * sizeof (CellSeed*), mGpuStreams [0 ], this ->getExtAllocator ());
386- GPUChkErrS (cudaMemcpyAsync (mCellsDeviceArray , mCellsDevice .data (), (nLayers - 2 ) * sizeof (CellSeed*), cudaMemcpyHostToDevice, mGpuStreams [0 ].get ()));
387379}
388380
389381template <int nLayers>
@@ -441,35 +433,15 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(bounded_vector<CellSeed>& seeds
441433}
442434
/// Reserve the device buffers that will receive the cell neighbours of a layer.
///
/// The number of neighbours is read back from the device-side neighbours LUT
/// into this->mNNeighbours[layer]. That count then sizes two allocations:
///  - mNeighbourPairsDevice[layer]: count * sizeof(gpuPair<int, int>) bytes,
///    every byte set to 0xFF (cudaMemset value -1);
///  - mNeighboursDevice[layer]: count * sizeof(int) bytes, not initialised here.
/// All device work is enqueued on mGpuStreams[layer].
///
/// \param layer Index of the layer whose neighbour buffers are reserved.
template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer)
{
  GPUTimer timer(mGpuStreams[layer], " reserving neighbours");

  // Fetch the neighbour count from the LUT.
  // NOTE(review): the element read is at index mNCells[layer + 1] - 1; presumably the last
  // entry of the inclusive scan over the next layer's cells - confirm against the LUT producer.
  GPUChkErrS(cudaMemcpyAsync(&(this->mNNeighbours[layer]), &(mNeighboursLUTDevice[layer][this->mNCells[layer + 1] - 1]), sizeof(unsigned int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get()));
  // The count is consumed on the host right below (logging and allocation sizes), so the
  // read-back has to be complete before we touch it.
  GPUChkErrS(cudaStreamSynchronize(mGpuStreams[layer].get()));
  const auto nNeighbours = this->mNNeighbours[layer];

  GPULog(" gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / constants::MB);
  allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[layer], this->getExtAllocator());
  GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[layer].get()));

  // This buffer holds plain ints, so report its size with sizeof(int) to match the allocation.
  GPULog(" gpu-allocation: reserving {} neighbours, for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(int) / constants::MB);
  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), mGpuStreams[layer], this->getExtAllocator());
}
474446
475447template <int nLayers>
@@ -532,15 +504,6 @@ void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(bounded_vector<CellSeed>&
532504 GPUChkErrS (cudaHostUnregister (seeds.data ()));
533505}
534506
535- template <int nLayers>
536- void TimeFrameGPU<nLayers>::unregisterRest()
537- {
538- GPUTimer timer (mGpuStreams [0 ], " unregistering rest of the host memory" );
539- GPULog (" unregistering rest of the host memory..." );
540- GPUChkErrS (cudaHostUnregister (mCellsDevice .data ()));
541- // GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data()));
542- }
543-
544507template <int nLayers>
545508void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
546509{
0 commit comments