tensorrt's native int8(best) is slower than fp16 for FasterVit[TensorRT 8.6.1 , Tesla T4]
tp-nan opened this issue · 2 comments
Description
Running faster_vit_0_224 (https://github.com/NVlabs/FasterViT) on a Tesla T4 GPU is slower with the int8+fp16 (`--best`) option than with fp16 alone.
The int8 (best) and fp16 latencies are very close: 1.46077 ms and 1.36375 ms, respectively. Because the last two stages of the network are both fused into a single Myelin layer, it is not possible to analyze the timing in detail.
If I want to improve the int8 performance of this network, are there any feasible directions?
TensorRT Version: 8.6.1
machine:Tesla T4
onnx opset: 17
Environment
TensorRT Version: 8.6.1
NVIDIA GPU: Tesla T4
NVIDIA Driver Version: 510
CUDA Version: 11.6
Commands or scripts:
int8+fp16:
pip install fastervit onnxsim
python test.py
fp16:
precision=fp16 python test.py
test.py:
import os
def export_onnx(torch_model, onnx_path, input=None):
    """Export ``torch_model`` to an ONNX file and simplify it in place.

    The model is switched to eval mode and moved to the CPU, run once on a
    dummy input to discover how many outputs it produces, exported with
    opset 17 and a dynamic batch axis on the input and on every output, and
    finally passed through onnx-simplifier. The simplified graph overwrites
    ``onnx_path``.

    Args:
        torch_model: The model to export. It is called once on the dummy
            input before the export.
        onnx_path: Destination path of the ``.onnx`` file.
        input: Optional example tensor used for tracing. When ``None``, a
            random ``(1, 3, 224, 224)`` CPU tensor is used. (The name
            shadows the ``input`` builtin; it is kept so that existing
            keyword callers continue to work.)
    """
    import torch

    if input is not None:
        dummy_input = input.cpu()
    else:
        dummy_input = torch.randn(1, 3, 224, 224, device="cpu")

    torch_model.eval()
    torch_model = torch_model.cpu()

    # Probe the model to count its outputs. A bare tensor is ONE output;
    # calling len() on it would return its first dimension (the batch size)
    # and request more output names than the graph has whenever batch > 1.
    with torch.no_grad():
        probe = torch_model(dummy_input)
    out_size = 1 if isinstance(probe, torch.Tensor) else len(probe)

    output_names = [f"output_{i}" for i in range(out_size)]

    # Mark axis 0 of the input and of every output as a dynamic batch axis.
    dynamic_axes = {"input": {0: "batch_size"}}
    for name in output_names:
        dynamic_axes[name] = {0: "batch_size"}

    torch.onnx.export(
        torch_model,
        dummy_input,
        onnx_path,
        verbose=False,
        opset_version=17,
        do_constant_folding=True,
        keep_initializers_as_inputs=True,
        input_names=["input"],  # input name
        output_names=output_names,  # output names
        dynamic_axes=dynamic_axes,
    )

    # Local switch kept from the original script: set to False to keep the
    # raw exported graph instead of the simplified one.
    run_simplifier = True
    if run_simplifier:
        import onnx
        from onnxsim import onnx_simplifier

        onnx_model = onnx.load(onnx_path)
        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
        # The second return value (the simplifier's check flag) is not
        # inspected, matching the original script.
        model_simp, _check = onnx_simplifier.simplify(onnx_model, check_n=0)
        onnx.save(model_simp, onnx_path)
    print(onnx_path, " saved")
def speed_test(all_models, tmp_dir, precision="fp16"):
    """Export each model to ONNX and benchmark it with ``trtexec``.

    Args:
        all_models: Dict mapping a model name to a model object. Each model
            is exported to ``<tmp_dir>/<model_name>.onnx``.
        tmp_dir: Directory that receives the exported ONNX files.
        precision: ``"fp16"`` runs ``trtexec --fp16``; ``"best"`` runs
            ``trtexec --best``.

    Raises:
        TypeError: If ``all_models`` is not a dict.
        ValueError: If ``precision`` is neither ``"fp16"`` nor ``"best"``.
            This is checked before any export so that a mistyped precision
            does not silently skip the benchmark.
        FileNotFoundError: If the ``trtexec`` executable cannot be found.
    """
    import subprocess

    if not isinstance(all_models, dict):
        raise TypeError(
            f"all_models must be a dict, got {type(all_models).__name__}"
        )

    trtexec_flags = {"fp16": "--fp16", "best": "--best"}
    if precision not in trtexec_flags:
        raise ValueError(
            f"unsupported precision {precision!r}; "
            f"expected one of {sorted(trtexec_flags)}"
        )
    precision_flag = trtexec_flags[precision]

    for model_name, m in all_models.items():
        m.eval()
        onnx_path = os.path.join(tmp_dir, f"{model_name}.onnx")
        export_onnx(m, onnx_path)
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact. check=False keeps the original behavior of
        # continuing with the next model even if trtexec exits non-zero.
        subprocess.run(
            ["trtexec", f"--onnx={onnx_path}", precision_flag],
            check=False,
        )
if __name__ == "__main__":
    # Script entry point: build the FasterViT model(s) and benchmark them
    # with the precision selected through the `precision` environment
    # variable ("best" when unset).
    tmp_dir = "tmp"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(tmp_dir, exist_ok=True)
    precision = os.environ.get("precision", "best")

    # fastervit
    import fastervit

    model_names = ["faster_vit_0_224"]
    all_models = {
        name: fastervit.create_model(name, pretrained=False, num_classes=3)
        for name in model_names
    }
    speed_test(all_models, tmp_dir, precision=precision)
@nvpohanh Is this the layer profile bug we fixed?
This is a known issue and we are working on it. (internal tracker id 3934067)