fabio-sim/LightGlue-ONNX

推理时间问题

Closed this issue · 5 comments

您好,我在推理时间遇到一些问题。
同一帧query图像,与多张图像进行匹配,耗时相差近10倍,请问是什么原因呢,该如何解决
1
2

您好 @guanba666,感谢您对LightGlue-ONNX的兴趣。

请问可以进行print(kpts0.shape, kpts1.shape)吗?

看起来与点的数目关系不大
3

推理代码如下

class LightGlueRunner:
    """Run the fused SuperPoint+LightGlue ONNX model with onnxruntime.

    NOTE(review): relies on module-level `ort` (onnxruntime), `np` (numpy)
    and `time` imports, and on a `normalize_keypoints` method defined
    elsewhere — confirm they exist in the full file.
    """

    # Fallback model location used when no path is supplied.
    DEFAULT_MODEL_PATH = "/home/notebook/data/group/dsz/LightGlue-ONNX/weights/superpoint_lightglue_fused_fp16.onnx"

    def __init__(
        self,
        lightglue_path=None,
    ):
        # Bug fix: the original unconditionally overwrote the caller's
        # `lightglue_path`, making the parameter dead. Only fall back to
        # the hard-coded default when no path was given.
        if lightglue_path is None:
            lightglue_path = self.DEFAULT_MODEL_PATH

        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        trt = False  # flip to True to prepend the TensorRT provider
        if trt:
            providers = [
                (
                    "TensorrtExecutionProvider",
                    {
                        "trt_fp16_enable": True,
                        "trt_engine_cache_enable": True,
                        "trt_engine_cache_path": "weights/cache",
                    },
                )
            ] + providers

        # v1.0
        sess_options = ort.SessionOptions()
        # Pin intra-op parallelism to a single thread: onnxruntime's
        # internal thread scheduling otherwise causes the ~10x latency
        # jitter reported in this issue.
        sess_options.intra_op_num_threads = 1
        self.lightglue = ort.InferenceSession(
            lightglue_path, sess_options=sess_options, providers=providers
        )

    def run(self, data: dict) -> dict:
        """Match two sets of keypoints/descriptors.

        Expects `data` to contain 'keypoints0/1', 'descriptors0/1' and
        'image_size0/1' (as (width, height) — TODO confirm; indices [1]
        and [0] are passed as height, width below). Returns a one-element
        list with 'matches0' (index of the match in image 1 for each
        keypoint of image 0, -1 if unmatched) and 'matching_scores0'.
        """
        kpts0 = np.array([data["keypoints0"]])
        desc0 = np.array([data["descriptors0"]]).astype(np.float32)
        size0 = data.get("image_size0")
        kpts1 = np.array([data["keypoints1"]])
        desc1 = np.array([data["descriptors1"]]).astype(np.float32)
        size1 = data.get("image_size1")
        b, m, _ = kpts0.shape
        b, n, _ = kpts1.shape

        t0 = time.time()
        # NOTE(review): this timing also includes CPU->GPU transfer when
        # running on CUDA; use IO-binding (see eval.py) to measure only
        # the matching step.
        matches0, mscores0 = self.lightglue.run(
            None,
            {
                "kpts0": self.normalize_keypoints(
                    kpts0, size0[1], size0[0]
                ),
                "kpts1": self.normalize_keypoints(
                    kpts1, size1[1], size1[0]
                ),
                "desc0": desc0,
                "desc1": desc1,
            },
        )
        t1 = time.time()
        print(f'match use time: {(t1 - t0)*1000}')

        # Scatter the (query_idx, train_idx) match pairs into dense
        # per-keypoint arrays; unmatched keypoints keep -1 / 0.0.
        m0 = np.full(m, -1)
        ms0 = np.full(m, 0, dtype=float)
        for i in range(len(matches0)):
            m0[matches0[i][0]] = matches0[i][1]
            ms0[matches0[i][0]] = mscores0[i]

        pred = [{
            'matches0': m0,
            'matching_scores0': ms0,
        }]
        return pred

可能是从CPU搬去GPU memory的时间也被包括。
代码请用:

LightGlue-ONNX/eval.py

Lines 177 to 202 in b1007b3

# Quoted from LightGlue-ONNX/eval.py (lines 177-202 @ b1007b3).
# Fix: GitHub's quote stripped all indentation, leaving the snippet
# syntactically invalid; restored conventional formatting.
lightglue_inputs = {
    "kpts0": LightGlueRunner.normalize_keypoints(
        kpts0, image0.shape[2], image0.shape[3]
    ),
    "kpts1": LightGlueRunner.normalize_keypoints(
        kpts1, image1.shape[2], image1.shape[3]
    ),
    "desc0": desc0,
    "desc1": desc1,
}
lightglue_outputs = ["matches0", "mscores0"]
if device == "cuda":
    # Prepare IO-Bindings so host->device copies happen before timing.
    binding = lightglue.io_binding()
    for name, arr in lightglue_inputs.items():
        binding.bind_cpu_input(name, arr)
    for name in lightglue_outputs:
        binding.bind_output(name, "cuda")
    # Measure only matching time
    start = time.perf_counter()
    result = lightglue.run_with_iobinding(binding)
    end = time.perf_counter()

eval.py仍然会出现耗时不稳定的问题,主要是onnxruntime内部资源调度的问题,修改为如下代码即可

# Fix: the quoted snippet mixed column-0 and 8-space indentation, which
# is invalid Python; restored a consistent level (method-body context).
sess_options = ort.SessionOptions()
# Single intra-op thread avoids onnxruntime's internal thread-scheduling
# jitter, which caused the unstable inference times in this issue.
sess_options.intra_op_num_threads = 1
self.lightglue = ort.InferenceSession(
    lightglue_path, sess_options=sess_options, providers=providers,
)