fabio-sim/LightGlue-ONNX

推理时间问题

Closed this issue · 5 comments

您好,我在推理时间遇到一些问题。
同一帧query图像,与多张图像进行匹配,耗时相差近10倍,请问是什么原因呢,该如何解决
1
2

您好 @guanba666,感谢您对LightGlue-ONNX的兴趣。

请问可以进行print(kpts0.shape, kpts1.shape)吗?

看起来与点的数目关系不大
3

推理代码如下

class LightGlueRunner:
    """Run the fused SuperPoint+LightGlue ONNX model with onnxruntime.

    NOTE(review): relies on module-level `ort` (onnxruntime), `np` (numpy)
    and `time` imports, and on a `normalize_keypoints` method defined
    elsewhere — confirm they exist in the full file.
    """

    # Fallback model location used when no path is supplied.
    DEFAULT_MODEL_PATH = "/home/notebook/data/group/dsz/LightGlue-ONNX/weights/superpoint_lightglue_fused_fp16.onnx"

    def __init__(
        self,
        lightglue_path=None,
    ):
        # Bug fix: the original unconditionally overwrote the caller's
        # `lightglue_path`, making the parameter dead. Only fall back to
        # the hard-coded default when no path was given.
        if lightglue_path is None:
            lightglue_path = self.DEFAULT_MODEL_PATH

        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        trt = False  # flip to True to prepend the TensorRT provider
        if trt:
            providers = [
                (
                    "TensorrtExecutionProvider",
                    {
                        "trt_fp16_enable": True,
                        "trt_engine_cache_enable": True,
                        "trt_engine_cache_path": "weights/cache",
                    },
                )
            ] + providers

        # v1.0
        sess_options = ort.SessionOptions()
        # Pin intra-op parallelism to a single thread: onnxruntime's
        # internal thread scheduling otherwise causes the ~10x latency
        # jitter reported in this issue.
        sess_options.intra_op_num_threads = 1
        self.lightglue = ort.InferenceSession(
            lightglue_path, sess_options=sess_options, providers=providers
        )

    def run(self, data: dict) -> dict:
        """Match two sets of keypoints/descriptors.

        Expects `data` to contain 'keypoints0/1', 'descriptors0/1' and
        'image_size0/1' (as (width, height) — TODO confirm; indices [1]
        and [0] are passed as height, width below). Returns a one-element
        list with 'matches0' (index of the match in image 1 for each
        keypoint of image 0, -1 if unmatched) and 'matching_scores0'.
        """
        kpts0 = np.array([data["keypoints0"]])
        desc0 = np.array([data["descriptors0"]]).astype(np.float32)
        size0 = data.get("image_size0")
        kpts1 = np.array([data["keypoints1"]])
        desc1 = np.array([data["descriptors1"]]).astype(np.float32)
        size1 = data.get("image_size1")
        b, m, _ = kpts0.shape
        b, n, _ = kpts1.shape

        t0 = time.time()
        # NOTE(review): this timing also includes CPU->GPU transfer when
        # running on CUDA; use IO-binding (see eval.py) to measure only
        # the matching step.
        matches0, mscores0 = self.lightglue.run(
            None,
            {
                "kpts0": self.normalize_keypoints(
                    kpts0, size0[1], size0[0]
                ),
                "kpts1": self.normalize_keypoints(
                    kpts1, size1[1], size1[0]
                ),
                "desc0": desc0,
                "desc1": desc1,
            },
        )
        t1 = time.time()
        print(f'match use time: {(t1 - t0)*1000}')

        # Scatter the (query_idx, train_idx) match pairs into dense
        # per-keypoint arrays; unmatched keypoints keep -1 / 0.0.
        m0 = np.full(m, -1)
        ms0 = np.full(m, 0, dtype=float)
        for i in range(len(matches0)):
            m0[matches0[i][0]] = matches0[i][1]
            ms0[matches0[i][0]] = mscores0[i]

        pred = [{
            'matches0': m0,
            'matching_scores0': ms0,
        }]
        return pred

可能是从CPU搬去GPU memory的时间也被包括。
代码请用:

LightGlue-ONNX/eval.py

Lines 177 to 202 in b1007b3

# Quoted from LightGlue-ONNX/eval.py (lines 177-202 @ b1007b3).
# Fix: GitHub's quote stripped all indentation, leaving the snippet
# syntactically invalid; restored conventional formatting.
lightglue_inputs = {
    "kpts0": LightGlueRunner.normalize_keypoints(
        kpts0, image0.shape[2], image0.shape[3]
    ),
    "kpts1": LightGlueRunner.normalize_keypoints(
        kpts1, image1.shape[2], image1.shape[3]
    ),
    "desc0": desc0,
    "desc1": desc1,
}
lightglue_outputs = ["matches0", "mscores0"]
if device == "cuda":
    # Prepare IO-Bindings so host->device copies happen before timing.
    binding = lightglue.io_binding()
    for name, arr in lightglue_inputs.items():
        binding.bind_cpu_input(name, arr)
    for name in lightglue_outputs:
        binding.bind_output(name, "cuda")
    # Measure only matching time
    start = time.perf_counter()
    result = lightglue.run_with_iobinding(binding)
    end = time.perf_counter()

eval.py仍然会出现耗时不稳定的问题,主要是onnxruntime内部资源调度的问题,修改为如下代码即可

# Fix: the quoted snippet mixed column-0 and 8-space indentation, which
# is invalid Python; restored a consistent level (method-body context).
sess_options = ort.SessionOptions()
# Single intra-op thread avoids onnxruntime's internal thread-scheduling
# jitter, which caused the unstable inference times in this issue.
sess_options.intra_op_num_threads = 1
self.lightglue = ort.InferenceSession(
    lightglue_path, sess_options=sess_options, providers=providers,
)