|
12 | 12 |
|
13 | 13 | #include <cuda_runtime.h> |
14 | 14 |
|
| 15 | +#include <algorithm> |
15 | 16 | #include <unistd.h> |
16 | 17 | #include <vector> |
17 | 18 |
|
@@ -581,6 +582,87 @@ void TimeFrameGPU<NLayers>::createTrackITSExtDevice(const size_t nSeeds) |
581 | 582 | GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, mNTracks * sizeof(o2::its::TrackITSExt))); |
582 | 583 | } |
583 | 584 |
|
| 585 | +template <int NLayers> |
| 586 | +void TimeFrameGPU<NLayers>::loadTrackExtensionStartStatesDevice() |
| 587 | +{ |
| 588 | + GPUTimer timer("loading track extension start states"); |
| 589 | + GPULog("gpu-transfer: loading {} track extension start states, for {:.2f} MB.", this->mTracks.size(), this->mTracks.size() * sizeof(o2::its::TrackExtensionStartState<NLayers>) / constants::MB); |
| 590 | + mTrackExtensionStartStatesDevice = nullptr; |
| 591 | + mTrackExtensionStartStates = bounded_vector<TrackExtensionStartState<NLayers>>(this->mTracks.size(), {}, this->getMemoryPool().get()); |
| 592 | + if (this->mTracks.empty()) { |
| 593 | + return; |
| 594 | + } |
| 595 | + for (size_t iTrack{0}; iTrack < this->mTracks.size(); ++iTrack) { |
| 596 | + const auto& track = this->mTracks[iTrack]; |
| 597 | + auto& state = mTrackExtensionStartStates[iTrack]; |
| 598 | + state.paramIn = track.getParamIn(); |
| 599 | + state.paramOut = track.getParamOut(); |
| 600 | + state.time = track.getTimeStamp(); |
| 601 | + state.chi2 = track.getChi2(); |
| 602 | + state.nClusters = track.getNClusters(); |
| 603 | + state.firstClusterLayer = static_cast<int>(track.getFirstClusterLayer()); |
| 604 | + state.lastClusterLayer = static_cast<int>(track.getLastClusterLayer()); |
| 605 | + for (int iLayer{0}; iLayer < NLayers; ++iLayer) { |
| 606 | + state.clusters[iLayer] = track.getClusterIndex(iLayer); |
| 607 | + } |
| 608 | + } |
| 609 | + allocMem(reinterpret_cast<void**>(&mTrackExtensionStartStatesDevice), mTrackExtensionStartStates.size() * sizeof(o2::its::TrackExtensionStartState<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 610 | + GPUChkErrS(cudaMemcpy(mTrackExtensionStartStatesDevice, mTrackExtensionStartStates.data(), mTrackExtensionStartStates.size() * sizeof(o2::its::TrackExtensionStartState<NLayers>), cudaMemcpyHostToDevice)); |
| 611 | +} |
| 612 | + |
| 613 | +template <int NLayers> |
| 614 | +void TimeFrameGPU<NLayers>::createTrackExtensionCandidatesDevice(const size_t nTracks) |
| 615 | +{ |
| 616 | + GPUTimer timer("reserving track extension candidates"); |
| 617 | + const size_t nCandidates = nTracks * MaxTrackExtensionCandidatesPerTrack; |
| 618 | + GPULog("gpu-allocation: reserving {} track extension candidates, for {:.2f} MB.", nCandidates, nCandidates * sizeof(o2::its::TrackExtensionCandidate<NLayers>) / constants::MB); |
| 619 | + mTrackExtensionCandidates = bounded_vector<TrackExtensionCandidate<NLayers>>(nCandidates, {}, this->getMemoryPool().get()); |
| 620 | + mTrackExtensionCandidatesDevice = nullptr; |
| 621 | + mTrackExtensionCandidateOffsetsDevice = nullptr; |
| 622 | + if (mTrackExtensionCandidates.empty()) { |
| 623 | + return; |
| 624 | + } |
| 625 | + allocMem(reinterpret_cast<void**>(&mTrackExtensionCandidatesDevice), nCandidates * sizeof(o2::its::TrackExtensionCandidate<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 626 | + allocMem(reinterpret_cast<void**>(&mTrackExtensionCandidateOffsetsDevice), (nTracks + 1) * sizeof(int), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 627 | +} |
| 628 | + |
| 629 | +template <int NLayers> |
| 630 | +void TimeFrameGPU<NLayers>::createTrackExtensionScratchDevice(const int nThreads, const int beamWidth) |
| 631 | +{ |
| 632 | + GPUTimer timer("reserving track extension scratch"); |
| 633 | + const size_t nHypotheses = static_cast<size_t>(std::max(1, nThreads)) * std::max(1, beamWidth); |
| 634 | + GPULog("gpu-allocation: reserving {} track extension hypotheses per scratch buffer, for {:.2f} MB each.", nHypotheses, nHypotheses * sizeof(o2::its::TrackExtensionHypothesis<NLayers>) / constants::MB); |
| 635 | + mActiveTrackExtensionHypothesesDevice = nullptr; |
| 636 | + mNextTrackExtensionHypothesesDevice = nullptr; |
| 637 | + if (nHypotheses == 0) { |
| 638 | + return; |
| 639 | + } |
| 640 | + allocMem(reinterpret_cast<void**>(&mActiveTrackExtensionHypothesesDevice), nHypotheses * sizeof(o2::its::TrackExtensionHypothesis<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 641 | + allocMem(reinterpret_cast<void**>(&mNextTrackExtensionHypothesesDevice), nHypotheses * sizeof(o2::its::TrackExtensionHypothesis<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 642 | +} |
| 643 | + |
| 644 | +template <int NLayers> |
| 645 | +void TimeFrameGPU<NLayers>::createTrackExtensionResultsDevice(const size_t nTracks) |
| 646 | +{ |
| 647 | + GPUTimer timer("reserving fitted track extension results"); |
| 648 | + mNTrackExtensionResults = 0; |
| 649 | + if (nTracks == 0 || mTrackExtensionCandidateOffsetsDevice == nullptr) { |
| 650 | + mTrackExtensionResults = bounded_vector<TrackExtensionResult<NLayers>>(0, {}, this->getMemoryPool().get()); |
| 651 | + mTrackExtensionResultsDevice = nullptr; |
| 652 | + return; |
| 653 | + } |
| 654 | + int nResults{0}; |
| 655 | + GPUChkErrS(cudaMemcpy(&nResults, mTrackExtensionCandidateOffsetsDevice + nTracks, sizeof(int), cudaMemcpyDeviceToHost)); |
| 656 | + mNTrackExtensionResults = nResults; |
| 657 | + GPULog("gpu-allocation: reserving {} fitted track extension results, for {:.2f} MB.", mNTrackExtensionResults, mNTrackExtensionResults * sizeof(o2::its::TrackExtensionResult<NLayers>) / constants::MB); |
| 658 | + mTrackExtensionResults = bounded_vector<TrackExtensionResult<NLayers>>(mNTrackExtensionResults, {}, this->getMemoryPool().get()); |
| 659 | + mTrackExtensionResultsDevice = nullptr; |
| 660 | + if (mTrackExtensionResults.empty()) { |
| 661 | + return; |
| 662 | + } |
| 663 | + allocMem(reinterpret_cast<void**>(&mTrackExtensionResultsDevice), mNTrackExtensionResults * sizeof(o2::its::TrackExtensionResult<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 664 | +} |
| 665 | + |
584 | 666 | template <int NLayers> |
585 | 667 | void TimeFrameGPU<NLayers>::downloadCellsDevice() |
586 | 668 | { |
@@ -627,6 +709,28 @@ void TimeFrameGPU<NLayers>::downloadTrackITSExtDevice() |
627 | 709 | GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, mTrackITSExt.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost)); |
628 | 710 | } |
629 | 711 |
|
| 712 | +template <int NLayers> |
| 713 | +void TimeFrameGPU<NLayers>::downloadTrackExtensionCandidatesDevice() |
| 714 | +{ |
| 715 | + GPUTimer timer("downloading track extension candidates"); |
| 716 | + GPULog("gpu-transfer: downloading {} track extension candidates, for {:.2f} MB.", mTrackExtensionCandidates.size(), mTrackExtensionCandidates.size() * sizeof(o2::its::TrackExtensionCandidate<NLayers>) / constants::MB); |
| 717 | + if (mTrackExtensionCandidates.empty()) { |
| 718 | + return; |
| 719 | + } |
| 720 | + GPUChkErrS(cudaMemcpy(mTrackExtensionCandidates.data(), mTrackExtensionCandidatesDevice, mTrackExtensionCandidates.size() * sizeof(o2::its::TrackExtensionCandidate<NLayers>), cudaMemcpyDeviceToHost)); |
| 721 | +} |
| 722 | + |
| 723 | +template <int NLayers> |
| 724 | +void TimeFrameGPU<NLayers>::downloadTrackExtensionResultsDevice() |
| 725 | +{ |
| 726 | + GPUTimer timer("downloading fitted track extension results"); |
| 727 | + GPULog("gpu-transfer: downloading {} fitted track extension results, for {:.2f} MB.", mTrackExtensionResults.size(), mTrackExtensionResults.size() * sizeof(o2::its::TrackExtensionResult<NLayers>) / constants::MB); |
| 728 | + if (mTrackExtensionResults.empty()) { |
| 729 | + return; |
| 730 | + } |
| 731 | + GPUChkErrS(cudaMemcpy(mTrackExtensionResults.data(), mTrackExtensionResultsDevice, mTrackExtensionResults.size() * sizeof(o2::its::TrackExtensionResult<NLayers>), cudaMemcpyDeviceToHost)); |
| 732 | +} |
| 733 | + |
630 | 734 | template <int NLayers> |
631 | 735 | void TimeFrameGPU<NLayers>::unregisterHostMemory(const int maxLayers) |
632 | 736 | { |
|
0 commit comments