Low performance comparing to NVIDIA's official 3d software stack in jetson.

Question

Low performance comparing to NVIDIA's official 3d software stack in jetson.

ZhenshengLee opened this issue 3 years ago · 5 comments

Recently I tested cuda-pcl which only provides api for jetson.

The test in my jetson xavier shows that the performance of pass through filter in cupoch is nearly only a half of which with cuda-pcl

below is my test screenshot. The test resource is from cuda-pcl

When using cuda-pcl-filter, it only costs 0.586943ms to filter 119978 points to 5110 points.

But when using cupoch, it costs

I will try to find solution inside cupoch and try not to import 3rd party codes or use lower api from cuda.

Any Suggestions to improve the performance so that it can make full usage of jetson gpu? @neka-nat

Thanks!

[ ]managed memory allocation

Answer 1 · 2021-04-13T01:21:21.000Z

The cupoch code is below, which is inspired by main.cpp from cuda-pcl

#include <fstream>
#include <chrono>

// #include <pcl/io/pcd_io.h>
// #include <pcl/filters/passthrough.h>
// #include <pcl/filters/voxel_grid.h>

// #include "cuda_runtime.h"
// #include "lib/cudaFilter.h"

#include <cupoch/cupoch.h>
#include <gflags/gflags.h>

// DEFINE_string(target_cloud, "./res/pcd/ndt/room_scan1.pcd", "Filename of target_cloud for ndt alignment");
DEFINE_string(input_cloud, "./res/pcd/sample.pcd", "Filename of source_cloud for cuda-filter");

using namespace cupoch;

void Getinfo(void)
{
    cudaDeviceProp prop;

    int count = 0;
    cudaGetDeviceCount(&count);
    printf("\nGPU has cuda devices: %d\n", count);
    for (int i = 0; i < count; ++i)
    {
        cudaGetDeviceProperties(&prop, i);
        printf("----device id: %d info----\n", i);
        printf("  GPU : %s \n", prop.name);
        printf("  Capbility: %d.%d\n", prop.major, prop.minor);
        printf("  Global memory: %luMB\n", prop.totalGlobalMem >> 20);
        printf("  Const memory: %luKB\n", prop.totalConstMem >> 10);
        printf("  SM in a block: %luKB\n", prop.sharedMemPerBlock >> 10);
        printf("  warp size: %d\n", prop.warpSize);
        printf("  threads in a block: %d\n", prop.maxThreadsPerBlock);
        printf("  block dim: (%d,%d,%d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("  grid dim: (%d,%d,%d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    }
    printf("\n");
}

void testCupoch(std::shared_ptr<geometry::PointCloud>& cloudSrc, std::shared_ptr<geometry::PointCloud>& cloudDst)
{
    std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
    std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::ratio<1, 1000>> time_span =
        std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);

    std::cout << "\n------------checking CUDA ---------------- " << std::endl;
    std::cout << "CUDA Loaded " << cloudSrc->points_.size() << std::endl;

    {
        unsigned int nCount = cloudSrc->points_.size();
        unsigned int countLeft = 0;

        std::cout << "\n------------checking CUDA PassThrough ---------------- " << std::endl;

        // setP.type = type;
        // setP.dim = 0;
        // setP.upFilterLimits = 0.5;
        // setP.downFilterLimits = -0.5;
        // setP.limitsNegative = false;
        // filterTest.set(setP);

        t1 = std::chrono::steady_clock::now();
        // filterTest.filter(output, &countLeft, input, nCount);
        cloudDst = cloudSrc->PassThroughFilter(0, -0.5, 0.5);
        // cloudSrc->PassThroughFilter(1, m_min_height_low, m_max_height_low);
        // cloudSrc->PassThroughFilter(2, m_min_distance_low, m_max_distance_low);
        countLeft = cloudDst->points_.size();

        t2 = std::chrono::steady_clock::now();
        time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
        std::cout << "CUDA PassThrough by Time: " << time_span.count() << " ms." << std::endl;
        std::cout << "CUDA PassThrough before filtering: " << nCount << std::endl;
        std::cout << "CUDA PassThrough after filtering: " << countLeft << std::endl;
    }

    // {
    //     unsigned int countLeft = 0;
    //     std::cout << "\n------------checking CUDA VoxelGrid---------------- " << std::endl;

    //     memset(outputData, 0, sizeof(float) * 4 * nCount);

    //     type = VOXELGRID;

    //     setP.type = type;
    //     setP.voxelX = 1;
    //     setP.voxelY = 1;
    //     setP.voxelZ = 1;

    //     filterTest.set(setP);
    //     int status = 0;
    //     cudaDeviceSynchronize();
    //     t1 = std::chrono::steady_clock::now();
    //     status = filterTest.filter(output, &countLeft, input, nCount);
    //     cudaDeviceSynchronize();
    //     t2 = std::chrono::steady_clock::now();

    //     if (status != 0)
    //         return;
    //     time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
    //     std::cout << "CUDA VoxelGrid by Time: " << time_span.count() << " ms." << std::endl;
    //     std::cout << "CUDA VoxelGrid before filtering: " << nCount << std::endl;
    //     std::cout << "CUDA VoxelGrid after filtering: " << countLeft << std::endl;

    //     pcl::PointCloud<pcl::PointXYZ>::Ptr cloudNew(new pcl::PointCloud<pcl::PointXYZ>);
    //     cloudNew->width = countLeft;
    //     cloudNew->height = 1;
    //     cloudNew->points.resize(cloudNew->width * cloudNew->height);

    //     int check = 0;
    //     for (std::size_t i = 0; i < cloudNew->size(); ++i)
    //     {
    //         cloudNew->points[i].x = output[i * 4 + 0];
    //         cloudNew->points[i].y = output[i * 4 + 1];
    //         cloudNew->points[i].z = output[i * 4 + 2];
    //     }
    //     pcl::io::savePCDFileASCII("after-cuda-VoxelGrid.pcd", *cloudNew);
    // }


}

// void testPCL(pcl::PointCloud<pcl::PointXYZ>::Ptr cloudSrc, pcl::PointCloud<pcl::PointXYZ>::Ptr cloudDst)
// {
//     std::cout << "\n\n------------checking PCL ---------------- " << std::endl;
//     std::cout << "PCL(CPU) Loaded " << cloudSrc->width * cloudSrc->height
//               << " data points from PCD file with the following fields: " << pcl::getFieldsList(*cloudSrc) << std::endl;

//     std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
//     std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
//     std::chrono::duration<double, std::ratio<1, 1000>> time_span =
//         std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);

//     int nCount = cloudSrc->width * cloudSrc->height;
//     float* outputData = (float*)cloudDst->points.data();
//     {
//         std::cout << "\n------------checking PCL(CPU) PassThrough ---------------- " << std::endl;

//         memset(outputData, 0, sizeof(float) * 4 * nCount);

//         // Create the filtering object
//         pcl::PassThrough<pcl::PointXYZ> pass;
//         pass.setInputCloud(cloudSrc);
//         pass.setFilterFieldName("x");
//         pass.setFilterLimits(-0.5, 0.5);
//         pass.setFilterLimitsNegative(false);

//         t1 = std::chrono::steady_clock::now();
//         pass.filter(*cloudDst);
//         t2 = std::chrono::steady_clock::now();

//         time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
//         std::cout << "PCL(CPU) PassThrough by Time: " << time_span.count() << " ms." << std::endl;

//         std::cout << "PointCloud before filtering: " << cloudSrc->width * cloudSrc->height << " data points ("
//                   << pcl::getFieldsList(*cloudSrc) << ")." << std::endl;
//         std::cout << "PointCloud after filtering: " << cloudDst->width * cloudDst->height << " data points ("
//                   << pcl::getFieldsList(*cloudDst) << ")." << std::endl;
//         pcl::io::savePCDFileASCII("after-pcl-PassThrough.pcd", *cloudDst);
//     }
//     {
//         std::cout << "\n------------checking PCL VoxelGrid---------------- " << std::endl;

//         memset(outputData, 0, sizeof(float) * 4 * nCount);

//         t1 = std::chrono::steady_clock::now();

//         // Create the filtering object
//         pcl::VoxelGrid<pcl::PointXYZ> sor;
//         sor.setInputCloud(cloudSrc);
//         sor.setLeafSize(1, 1, 1);
//         sor.filter(*cloudDst);

//         t2 = std::chrono::steady_clock::now();
//         time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
//         std::cout << "PCL VoxelGrid by Time: " << time_span.count() << " ms." << std::endl;
//         std::cout << "PointCloud before filtering: " << cloudSrc->width * cloudSrc->height << " data points ("
//                   << pcl::getFieldsList(*cloudSrc) << ")." << std::endl;
//         std::cout << "PointCloud after filtering: " << cloudDst->width * cloudDst->height << " data points ("
//                   << pcl::getFieldsList(*cloudDst) << ")." << std::endl;

//         pcl::io::savePCDFileASCII("after-pcl-VoxelGrid.pcd", *cloudDst);
//     }
// }

int main(int argc, const char** argv)
{
    std::string file = "./sample.pcd";
    if (argc > 1)
        file = (argv[1]);

    Getinfo();

    // pcl::PointCloud<pcl::PointXYZ>::Ptr cloudSrc(new pcl::PointCloud<pcl::PointXYZ>);
    // pcl::PointCloud<pcl::PointXYZ>::Ptr cloudDst(new pcl::PointCloud<pcl::PointXYZ>);
    std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
    std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()};

    // if (pcl::io::loadPCDFile<pcl::PointXYZ>(file.c_str(), *cloudSrc) == -1)
    // {
    //     std::cout << "Error:can not open the file: " << file.c_str() << std::endl;
    //     return (-1);
    // }
    utility::InitializeAllocator();
    utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);

    // Loading second scan of room from new perspective.
    input_cloud = io::CreatePointCloudFromFile(FLAGS_input_cloud);
    std::cout << "Loaded " << input_cloud->points_.size() << " data points from " << FLAGS_input_cloud << std::endl;

    testCupoch(input_cloud, output_cloud);
    // testPCL(cloudSrc, cloudDst);

    return 0;
}

Answer 2 · 2021-04-13T04:51:03.000Z

Hi,
Thank you for reporting!
I've modified the pool allocator to work better.
Using the latest master and adding the following settings makes it about 30% faster.

InitializeAllocator(PoolAllocation, 1000000000);

Answer 3 · 2021-04-13T08:45:12.000Z

Thank you very much!

Answer 4 · 2021-04-21T02:37:43.000Z

x86 pc with qudro-p4000 gpu
PoolAllocation strategy
cuda-filter-cupoch context
filter 119978 points to 5510.

The passfilter function costs 0.096551ms

Answer 5 · 2021-04-21T03:04:09.000Z

The InitializeAllocator calling position does affect the behavior of pointcloud function

// speed up around 30% as you said
std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()};
InitializeAllocator(PoolAllocation, 1000000000);
utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);
//...

// changes back to the normal spped
InitializeAllocator(PoolAllocation, 1000000000);
utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);
std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()};

I know it is about allocation order of pointcloud, but it is abnormal.