diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 50888c676df77..eacf514c7a91d 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -32,14 +32,13 @@ #include "ITStracking/Cluster.h" #include "ITStracking/Cell.h" #include "DataFormatsITS/TrackITS.h" - #include "ITStrackingGPU/TrackingKernels.h" #include "ITStrackingGPU/Utils.h" +#include "utils/strtag.h" // O2 track model #include "ReconstructionDataFormats/Track.h" #include "DetectorsBase/Propagator.h" -#include "utils/strtag.h" using namespace o2::track; namespace o2::its @@ -1108,18 +1107,12 @@ void processNeighboursHandler(const int startLayer, const int nThreads) { constexpr uint64_t Tag = qStr2Tag("ITS_PNH1"); - - // allocators used + alloc->pushTagOnStack(Tag); auto allocInt = gpu::TypedAllocator(alloc); auto allocCellSeed = gpu::TypedAllocator>(alloc); - // use sync_policy, this part cannot be run async but tell thrust to use the allocator - auto sync_policy = THRUST_NAMESPACE::par(gpu::TypedAllocator(alloc)); - - // put initial computation on Tag1 - alloc->pushTagOnStack(Tag); - - // start processing of cells thrust::device_vector> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(alloc)).on(gpu::Stream::DefaultStream); + gpu::processNeighboursKernel<<>>( startLayer, startLevel, @@ -1138,10 +1131,10 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - thrust::exclusive_scan(sync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin()); - auto foundSeeds{foundSeedsTable.back()}; - thrust::device_vector> updatedCellId(foundSeeds, 0, allocInt); - thrust::device_vector, gpu::TypedAllocator>> updatedCellSeed(foundSeeds, allocCellSeed); + thrust::exclusive_scan(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin()); + + thrust::device_vector> updatedCellId(foundSeedsTable.back(), 0, allocInt); + thrust::device_vector, gpu::TypedAllocator>> updatedCellSeed(foundSeedsTable.back(), allocCellSeed); gpu::processNeighboursKernel<<>>( startLayer, startLevel, @@ -1160,41 +1153,20 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); + GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream)); - // now do inward steps until stop is reached int level = startLevel; - - // Host buffers to break dependency - // FIXME: these should be on our memory resource! - std::vector hostCellId; - std::vector> hostCellSeed; - - // inward loop + thrust::device_vector> lastCellId(allocInt); + thrust::device_vector, gpu::TypedAllocator>> lastCellSeed(allocCellSeed); for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { - // copy current results to host - hostCellId.resize(updatedCellId.size()); - hostCellSeed.resize(updatedCellSeed.size()); - thrust::copy(updatedCellId.begin(), updatedCellId.end(), hostCellId.begin()); - thrust::copy(updatedCellSeed.begin(), updatedCellSeed.end(), hostCellSeed.begin()); - - auto lastCellSeedSize{hostCellSeed.size()}; - // but before we clear the memory, and immediately start a new block - alloc->popTagOffStack(Tag); - alloc->pushTagOnStack(Tag); - - // based on the previous step's result create new LUT and zero it - thrust::device_vector>(allocInt).swap(foundSeedsTable); - foundSeedsTable.resize(lastCellSeedSize + 1); - thrust::fill(sync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), 0); - - // recreate lastCell vectors from host - thrust::device_vector> lastCellId(hostCellId.begin(), hostCellId.end(), allocInt); - thrust::device_vector, gpu::TypedAllocator>> lastCellSeed(hostCellSeed.begin(), hostCellSeed.end(), allocCellSeed); - // also create new vectors on new block + lastCellSeed.swap(updatedCellSeed); + lastCellId.swap(updatedCellId); thrust::device_vector, gpu::TypedAllocator>>(allocCellSeed).swap(updatedCellSeed); thrust::device_vector>(allocInt).swap(updatedCellId); + auto lastCellSeedSize{lastCellSeed.size()}; + foundSeedsTable.resize(lastCellSeedSize + 1); + thrust::fill(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), 0); - // start step gpu::processNeighboursKernel<<>>( iLayer, --level, @@ -1213,13 +1185,14 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - // how many new seeds where found - thrust::exclusive_scan(sync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin()); - foundSeeds = foundSeedsTable.back(); - // do a resize, we don't need to set the memory now since we know that all of these are written to - // Note though this does not clear the memory... + thrust::exclusive_scan(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin()); + + auto foundSeeds{foundSeedsTable.back()}; updatedCellId.resize(foundSeeds); + thrust::fill(nosync_policy, updatedCellId.begin(), updatedCellId.end(), 0); updatedCellSeed.resize(foundSeeds); + thrust::fill(nosync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), CellSeed()); + gpu::processNeighboursKernel<<>>( iLayer, level, @@ -1239,14 +1212,12 @@ void processNeighboursHandler(const int startLayer, propagator, matCorrType); } - - // final copy of result - const auto selector = gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5)); - const auto count = thrust::count_if(sync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), selector); - thrust::device_vector, gpu::TypedAllocator>> outSeeds(count, allocCellSeed); - thrust::copy_if(sync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), selector); - seedsHost.reserve(seedsHost.size() + count); - thrust::copy(outSeeds.begin(), outSeeds.end(), std::back_inserter(seedsHost)); + GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream)); + thrust::device_vector, gpu::TypedAllocator>> outSeeds(updatedCellSeed.size(), allocCellSeed); + auto end = thrust::copy_if(nosync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5))); + auto s{end - outSeeds.begin()}; + seedsHost.reserve(seedsHost.size() + s); + thrust::copy(outSeeds.begin(), outSeeds.begin() + s, std::back_inserter(seedsHost)); alloc->popTagOffStack(Tag); }