tensorrt's native int8(best) is slower than fp16 for FasterVit[TensorRT 8.6.1 , Tesla T4]

Question

tensorrt's native int8(best) is slower than fp16 for FasterVit[TensorRT 8.6.1 , Tesla T4]

tp-nan opened this issue a year ago · 2 comments

Description

Running faster_vit_0_224 (https://github.com/NVlabs/FasterViT) with the int8+fp16 (`--best`) option on a Tesla T4 GPU is slower than running it with fp16 alone.

The int8 (best) and fp16 latencies are very close, at 1.46077 ms and 1.36375 ms respectively. Because the last two stages of the network are fused into a single Myelin layer, the timing cannot be analyzed in detail.

If I want to improve the int8 performance of this network, are there any feasible directions?

TensorRT Version: 8.6.1
machine: Tesla T4
onnx opset: 17

Environment

TensorRT Version: 8.6.1

NVIDIA GPU: Tesla T4

NVIDIA Driver Version: 510

CUDA Version: 11.6

Commands or scripts:
int8+fp16:

pip install fastervit onnxsim
python test.py

fp16:

precision=fp16 python test.py

test.py:

import os


def export_onnx(torch_model, onnx_path, input=None):
    """Export ``torch_model`` to ONNX at ``onnx_path`` and simplify the file.

    ``.eval()`` and ``.cpu()`` are called on the model, which is then run once
    on a dummy input to find out how many outputs it produces. The graph
    input is named ``input`` and the outputs ``output_0``, ``output_1``, ...;
    dimension 0 of the input and of every output is exported as the dynamic
    axis ``batch_size``. The saved file is afterwards reloaded, passed
    through ONNX shape inference and onnx-simplifier, and overwritten with
    the simplified model.

    Args:
        torch_model: Model to export.
        onnx_path: Destination path of the ``.onnx`` file.
        input: Optional example input tensor; it is moved to the CPU before
            use. Defaults to a random ``(1, 3, 224, 224)`` CPU tensor. (The
            name shadows the builtin but is kept so keyword callers still
            work.)
    """
    import torch

    if input is not None:
        dummy_input = input.cpu()
    else:
        dummy_input = torch.randn(1, 3, 224, 224, device="cpu")

    torch_model.eval()
    torch_model = torch_model.cpu()

    # Probe forward pass, used only to count the outputs, so no autograd
    # bookkeeping is needed.
    with torch.no_grad():
        probe = torch_model(dummy_input)
    # A bare tensor is ONE output. Calling len() on it would return the size
    # of dimension 0 (the batch size), which is only accidentally correct for
    # a batch of 1 and yields too many output names for larger batches.
    out_size = 1 if isinstance(probe, torch.Tensor) else len(probe)

    output_names = [f"output_{i}" for i in range(out_size)]
    dynamic_axes = {"input": {0: "batch_size"}}
    for name in output_names:
        dynamic_axes[name] = {0: "batch_size"}

    torch.onnx.export(
        torch_model,
        dummy_input,
        onnx_path,
        verbose=False,
        opset_version=17,
        do_constant_folding=True,
        keep_initializers_as_inputs=True,
        input_names=["input"],  # input name
        output_names=output_names,  # output names
        dynamic_axes=dynamic_axes,
    )

    simplify = True  # set to False to keep the raw, unsimplified export
    if simplify:
        import onnx
        from onnxsim import onnx_simplifier

        onnx_model = onnx.load(onnx_path)
        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
        # NOTE(review): check_n=0 presumably disables onnxsim's validation
        # runs -- confirm. The returned check flag is not inspected.
        model_simp, _ = onnx_simplifier.simplify(onnx_model, check_n=0)
        onnx.save(model_simp, onnx_path)
    print(onnx_path, " saved")
  



def speed_test(all_models, tmp_dir, precision="fp16"):
    """Export every model to ONNX and benchmark it with ``trtexec``.

    Args:
        all_models: Dict mapping a model name to the model object. The name
            becomes the ONNX file stem inside ``tmp_dir``.
        tmp_dir: Directory that receives ``<model_name>.onnx``. It is not
            created here.
        precision: ``"fp16"`` runs ``trtexec --fp16``; ``"best"`` runs
            ``trtexec --best``.

    Raises:
        TypeError: If ``all_models`` is not a dict.
        ValueError: If ``precision`` is not a supported value. This is
            checked before any model is exported, so a typo does not end in
            a silent run that benchmarks nothing.
        FileNotFoundError: If the ``trtexec`` executable cannot be found.
    """
    import subprocess

    if not isinstance(all_models, dict):
        raise TypeError(
            f"all_models must be a dict, got {type(all_models).__name__}"
        )

    trtexec_flags = {"fp16": "--fp16", "best": "--best"}
    if precision not in trtexec_flags:
        raise ValueError(
            f"unsupported precision {precision!r}; "
            f"expected one of {sorted(trtexec_flags)}"
        )
    precision_flag = trtexec_flags[precision]

    for model_name, m in all_models.items():
        m.eval()
        onnx_path = os.path.join(tmp_dir, f"{model_name}.onnx")
        export_onnx(m, onnx_path)

        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact. check=False: a non-zero trtexec exit code is
        # ignored so the remaining models are still benchmarked.
        subprocess.run(
            ["trtexec", f"--onnx={onnx_path}", precision_flag],
            check=False,
        )
        
if __name__ == "__main__":
    # Scratch directory for the exported ONNX files. exist_ok=True replaces
    # the separate exists() check, which could race with another process.
    tmp_dir = "tmp"
    os.makedirs(tmp_dir, exist_ok=True)

    # Precision is selected through the `precision` environment variable,
    # e.g. `precision=fp16 python test.py`; without it "best" is used.
    precision = os.environ.get("precision", "best")

    # fastervit
    import fastervit

    model_names = ["faster_vit_0_224"]
    all_models = {
        name: fastervit.create_model(name, pretrained=False, num_classes=3)
        for name in model_names
    }
    speed_test(all_models, tmp_dir, precision=precision)

Answer 1 · 2023-08-08T14:09:02.000Z

@nvpohanh Is this the layer profile bug we fixed?

Answer 2 · 2023-08-09T04:14:04.000Z

This is a known issue and we are working on it. (internal tracker id 3934067)