Low performance comparing to NVIDIA's official 3d software stack in jetson.
ZhenshengLee opened this issue · 5 comments
Recently I tested cuda-pcl which only provides api for jetson.
The test in my jetson xavier shows that the performance of pass through filter in cupoch is nearly only a half of which with cuda-pcl
below is my test screenshot. The test resource is from cuda-pcl
When using cuda-pcl-filter, it only costs 0.586943ms to filter 119978 points to 5110 points.
But when using cupoch, it costs
I will try to find solution inside cupoch and try not to import 3rd party codes or use lower api from cuda.
Any Suggestions to improve the performance so that it can make full usage of jetson gpu? @neka-nat
Thanks!
[ ]managed memory allocation
The cupoch code is below, which is inspired by main.cpp from cuda-pcl
#include <fstream>
#include <chrono>
// #include <pcl/io/pcd_io.h>
// #include <pcl/filters/passthrough.h>
// #include <pcl/filters/voxel_grid.h>
// #include "cuda_runtime.h"
// #include "lib/cudaFilter.h"
#include <cupoch/cupoch.h>
#include <gflags/gflags.h>
// DEFINE_string(target_cloud, "./res/pcd/ndt/room_scan1.pcd", "Filename of target_cloud for ndt alignment");
DEFINE_string(input_cloud, "./res/pcd/sample.pcd", "Filename of source_cloud for cuda-filter");
using namespace cupoch;
void Getinfo(void)
{
cudaDeviceProp prop;
int count = 0;
cudaGetDeviceCount(&count);
printf("\nGPU has cuda devices: %d\n", count);
for (int i = 0; i < count; ++i)
{
cudaGetDeviceProperties(&prop, i);
printf("----device id: %d info----\n", i);
printf(" GPU : %s \n", prop.name);
printf(" Capbility: %d.%d\n", prop.major, prop.minor);
printf(" Global memory: %luMB\n", prop.totalGlobalMem >> 20);
printf(" Const memory: %luKB\n", prop.totalConstMem >> 10);
printf(" SM in a block: %luKB\n", prop.sharedMemPerBlock >> 10);
printf(" warp size: %d\n", prop.warpSize);
printf(" threads in a block: %d\n", prop.maxThreadsPerBlock);
printf(" block dim: (%d,%d,%d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
printf(" grid dim: (%d,%d,%d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
}
printf("\n");
}
void testCupoch(std::shared_ptr<geometry::PointCloud>& cloudSrc, std::shared_ptr<geometry::PointCloud>& cloudDst)
{
std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
std::chrono::duration<double, std::ratio<1, 1000>> time_span =
std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
std::cout << "\n------------checking CUDA ---------------- " << std::endl;
std::cout << "CUDA Loaded " << cloudSrc->points_.size() << std::endl;
{
unsigned int nCount = cloudSrc->points_.size();
unsigned int countLeft = 0;
std::cout << "\n------------checking CUDA PassThrough ---------------- " << std::endl;
// setP.type = type;
// setP.dim = 0;
// setP.upFilterLimits = 0.5;
// setP.downFilterLimits = -0.5;
// setP.limitsNegative = false;
// filterTest.set(setP);
t1 = std::chrono::steady_clock::now();
// filterTest.filter(output, &countLeft, input, nCount);
cloudDst = cloudSrc->PassThroughFilter(0, -0.5, 0.5);
// cloudSrc->PassThroughFilter(1, m_min_height_low, m_max_height_low);
// cloudSrc->PassThroughFilter(2, m_min_distance_low, m_max_distance_low);
countLeft = cloudDst->points_.size();
t2 = std::chrono::steady_clock::now();
time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
std::cout << "CUDA PassThrough by Time: " << time_span.count() << " ms." << std::endl;
std::cout << "CUDA PassThrough before filtering: " << nCount << std::endl;
std::cout << "CUDA PassThrough after filtering: " << countLeft << std::endl;
}
// {
// unsigned int countLeft = 0;
// std::cout << "\n------------checking CUDA VoxelGrid---------------- " << std::endl;
// memset(outputData, 0, sizeof(float) * 4 * nCount);
// type = VOXELGRID;
// setP.type = type;
// setP.voxelX = 1;
// setP.voxelY = 1;
// setP.voxelZ = 1;
// filterTest.set(setP);
// int status = 0;
// cudaDeviceSynchronize();
// t1 = std::chrono::steady_clock::now();
// status = filterTest.filter(output, &countLeft, input, nCount);
// cudaDeviceSynchronize();
// t2 = std::chrono::steady_clock::now();
// if (status != 0)
// return;
// time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
// std::cout << "CUDA VoxelGrid by Time: " << time_span.count() << " ms." << std::endl;
// std::cout << "CUDA VoxelGrid before filtering: " << nCount << std::endl;
// std::cout << "CUDA VoxelGrid after filtering: " << countLeft << std::endl;
// pcl::PointCloud<pcl::PointXYZ>::Ptr cloudNew(new pcl::PointCloud<pcl::PointXYZ>);
// cloudNew->width = countLeft;
// cloudNew->height = 1;
// cloudNew->points.resize(cloudNew->width * cloudNew->height);
// int check = 0;
// for (std::size_t i = 0; i < cloudNew->size(); ++i)
// {
// cloudNew->points[i].x = output[i * 4 + 0];
// cloudNew->points[i].y = output[i * 4 + 1];
// cloudNew->points[i].z = output[i * 4 + 2];
// }
// pcl::io::savePCDFileASCII("after-cuda-VoxelGrid.pcd", *cloudNew);
// }
}
// void testPCL(pcl::PointCloud<pcl::PointXYZ>::Ptr cloudSrc, pcl::PointCloud<pcl::PointXYZ>::Ptr cloudDst)
// {
// std::cout << "\n\n------------checking PCL ---------------- " << std::endl;
// std::cout << "PCL(CPU) Loaded " << cloudSrc->width * cloudSrc->height
// << " data points from PCD file with the following fields: " << pcl::getFieldsList(*cloudSrc) << std::endl;
// std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
// std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
// std::chrono::duration<double, std::ratio<1, 1000>> time_span =
// std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
// int nCount = cloudSrc->width * cloudSrc->height;
// float* outputData = (float*)cloudDst->points.data();
// {
// std::cout << "\n------------checking PCL(CPU) PassThrough ---------------- " << std::endl;
// memset(outputData, 0, sizeof(float) * 4 * nCount);
// // Create the filtering object
// pcl::PassThrough<pcl::PointXYZ> pass;
// pass.setInputCloud(cloudSrc);
// pass.setFilterFieldName("x");
// pass.setFilterLimits(-0.5, 0.5);
// pass.setFilterLimitsNegative(false);
// t1 = std::chrono::steady_clock::now();
// pass.filter(*cloudDst);
// t2 = std::chrono::steady_clock::now();
// time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
// std::cout << "PCL(CPU) PassThrough by Time: " << time_span.count() << " ms." << std::endl;
// std::cout << "PointCloud before filtering: " << cloudSrc->width * cloudSrc->height << " data points ("
// << pcl::getFieldsList(*cloudSrc) << ")." << std::endl;
// std::cout << "PointCloud after filtering: " << cloudDst->width * cloudDst->height << " data points ("
// << pcl::getFieldsList(*cloudDst) << ")." << std::endl;
// pcl::io::savePCDFileASCII("after-pcl-PassThrough.pcd", *cloudDst);
// }
// {
// std::cout << "\n------------checking PCL VoxelGrid---------------- " << std::endl;
// memset(outputData, 0, sizeof(float) * 4 * nCount);
// t1 = std::chrono::steady_clock::now();
// // Create the filtering object
// pcl::VoxelGrid<pcl::PointXYZ> sor;
// sor.setInputCloud(cloudSrc);
// sor.setLeafSize(1, 1, 1);
// sor.filter(*cloudDst);
// t2 = std::chrono::steady_clock::now();
// time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
// std::cout << "PCL VoxelGrid by Time: " << time_span.count() << " ms." << std::endl;
// std::cout << "PointCloud before filtering: " << cloudSrc->width * cloudSrc->height << " data points ("
// << pcl::getFieldsList(*cloudSrc) << ")." << std::endl;
// std::cout << "PointCloud after filtering: " << cloudDst->width * cloudDst->height << " data points ("
// << pcl::getFieldsList(*cloudDst) << ")." << std::endl;
// pcl::io::savePCDFileASCII("after-pcl-VoxelGrid.pcd", *cloudDst);
// }
// }
int main(int argc, const char** argv)
{
std::string file = "./sample.pcd";
if (argc > 1)
file = (argv[1]);
Getinfo();
// pcl::PointCloud<pcl::PointXYZ>::Ptr cloudSrc(new pcl::PointCloud<pcl::PointXYZ>);
// pcl::PointCloud<pcl::PointXYZ>::Ptr cloudDst(new pcl::PointCloud<pcl::PointXYZ>);
std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()};
// if (pcl::io::loadPCDFile<pcl::PointXYZ>(file.c_str(), *cloudSrc) == -1)
// {
// std::cout << "Error:can not open the file: " << file.c_str() << std::endl;
// return (-1);
// }
utility::InitializeAllocator();
utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);
// Loading second scan of room from new perspective.
input_cloud = io::CreatePointCloudFromFile(FLAGS_input_cloud);
std::cout << "Loaded " << input_cloud->points_.size() << " data points from " << FLAGS_input_cloud << std::endl;
testCupoch(input_cloud, output_cloud);
// testPCL(cloudSrc, cloudDst);
return 0;
}
Hi,
Thank you for reporting!
I've modified the pool allocator to work better.
Using the latest master and adding the following settings makes it about 30% faster.
InitializeAllocator(PoolAllocation, 1000000000);
Thank you very much!
- x86 pc with qudro-p4000 gpu
PoolAllocation
strategy- cuda-filter-cupoch context
- filter 119978 points to 5510.
The passfilter function costs 0.096551ms
The InitializeAllocator
calling position does affect the behavior of pointcloud function
// speed up around 30% as you said
std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()};
InitializeAllocator(PoolAllocation, 1000000000);
utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);
//...
// changes back to the normal spped
InitializeAllocator(PoolAllocation, 1000000000);
utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);
std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()};
I know it is about allocation order of pointcloud, but it is abnormal.