How to run inference on a video?
IronmanVsThanos opened this issue · 1 comment
How can I run inference on a video? Thank you, my hero!
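The script below loads a DINO checkpoint, reads the input video frame by frame with OpenCV, runs the detector on each frame, draws the predicted boxes, and writes the annotated frames to a new video. The default paths in the argument parser are machine-specific examples; the script assumes it is run from the DINO repository root so that `main` and `util` are importable.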
```python
import argparse
import random
import time

import cv2
import numpy as np
import torch
import torchvision
from PIL import Image
from torchvision.ops.boxes import batched_nms

# These imports come from the DINO repository; run the script from the repo root.
from main import build_model_main
from util.slconfig import SLConfig
t_total = 0.0  # accumulated inference time over all frames
def get_args_parser():
    parser = argparse.ArgumentParser('Set transformer detector', add_help=False)
    # The defaults below are machine-specific examples; override them on the command line.
    # Note: --output_dir is actually the path of the output video file.
    parser.add_argument('--weights', default="/mnt/sda1/Deep_learning/code/DINO-main/logs/DINO/R50-4S-coco_city/checkpoint_best_regular.pth", type=str)
    parser.add_argument('--input_video', default="/mnt/sda1/Deep_learning/code/DINO-main/images/daolu1.avi", type=str)
    parser.add_argument('--output_dir', default="/mnt/sda1/Deep_learning/code/DINO-main/images/output/inference_result.mp4", type=str)
    parser.add_argument('--model_config_path', default="config/DINO/DINO_4scale.py", type=str)
    parser.add_argument('--device', default="cuda", type=str)
    return parser
def box_cxcywh_to_xyxy(x):
    # x is an (N, 4) tensor of boxes in (center_x, center_y, width, height)
    # format (N = number of queries); unbind(1) splits it into four (N,) tensors.
    x_c, y_c, w, h = x.unbind(1)
    # Convert to corner format: (x_min, y_min, x_max, y_max).
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=1)
def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    # Convert the (N, 4) tensor from center format to corner (x_min, y_min, x_max, y_max) format.
    b = box_cxcywh_to_xyxy(out_bbox)
    # The coordinates are normalized to [0, 1]; scale them back to pixel
    # coordinates relative to the size of the image fed to the network.
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return b
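
# Worked example (made-up numbers, not model output): a normalized box
# (cx, cy, w, h) = (0.5, 0.5, 0.2, 0.4) on a 1920x1080 frame maps to
# (x_min, y_min, x_max, y_max) = (768, 324, 1152, 756):
#   x_min = (0.5 - 0.1) * 1920 = 768,  x_max = (0.5 + 0.1) * 1920 = 1152
#   y_min = (0.5 - 0.2) * 1080 = 324,  y_max = (0.5 + 0.2) * 1080 = 756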
def filter_boxes(scores, boxes, confidence=0.7, apply_nms=True, iou=0.5):
    # scores: (N, num_classes), boxes: (N, 4). scores.max(-1) returns the maximum
    # value along the last dimension together with its index (the class id).
    # keep is a boolean mask selecting the queries whose best class score
    # clears the confidence threshold.
    keep = scores.max(-1).values > confidence
    scores, boxes = scores[keep], boxes[keep]
    if apply_nms:
        top_scores, labels = scores.max(-1)
        keep = batched_nms(boxes, top_scores, labels, iou)
        scores, boxes = scores[keep], boxes[keep]
    return scores, boxes
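
# Note: torchvision's batched_nms runs NMS independently per label, so boxes of
# different classes never suppress each other; it returns the surviving indices.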
# Class names for this checkpoint (a custom label set, not the original COCO
# list); index 0 is the 'N/A' placeholder.
CLASSES = ['N/A', "car", "coach", "bus", "truck", "tricycle", "person",
           "twowheelsvehicle", "taxi", "license_plate", "other_vehicles"]
def plot_one_box(x, img, color=None, label=None, line_thickness=2):
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line/font thickness
    color = color or [random.randint(0, 255) for _ in range(3)]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
        cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255],
                    thickness=tf, lineType=cv2.LINE_AA)
def process_video(args):
    global t_total
    print(args)
    device = torch.device(args.device)
    model_args = SLConfig.fromfile(args.model_config_path)
    model_args.device = args.device
    # build_model_main returns the model, the training criterion, and the
    # post-processors; only the model is needed for inference.
    model, criterion, postprocessors = build_model_main(model_args)
    print(model)
    checkpoint = torch.load(args.weights, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    # Move the model to the device once, instead of calling .cuda() per frame.
    model.to(device)
    model.eval()
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("parameters:", n_parameters)
    cap = cv2.VideoCapture(args.input_video)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    out = cv2.VideoWriter(args.output_dir, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
    to_tensor = torchvision.transforms.ToTensor()
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # OpenCV delivers BGR frames; the model expects RGB.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        # Add the batch dimension: (3, H, W) -> (1, 3, H, W).
        image_tensor = to_tensor(image).unsqueeze(0).to(device)
        time1 = time.time()
        # pred_logits: (1, num_queries, num_classes), pred_boxes: (1, num_queries, 4)
        with torch.no_grad():
            inference_result = model(image_tensor)
        time2 = time.time()
        t_perFrame = time2 - time1
        print("inference_time:", t_perFrame)
        t_total += t_perFrame
        # Softmax over the last dimension, then drop the final column, which is
        # the no-object ("background") logit. probas: (num_queries, num_classes - 1)
        probas = inference_result['pred_logits'].softmax(-1)[0, :, :-1].cpu()
        # pred_boxes[0] is (num_queries, 4) in normalized cxcywh; convert to xyxy
        # and scale to the input size. Note (shape[3], shape[2]) is (width, height).
        bboxes_scaled = rescale_bboxes(inference_result['pred_boxes'][0].cpu(),
                                       (image_tensor.shape[3], image_tensor.shape[2]))
        # Keep only the confident boxes (and their per-class scores) after
        # thresholding and NMS.
        scores, boxes = filter_boxes(probas, bboxes_scaled)
        scores = scores.data.numpy()
        boxes = boxes.data.numpy()
        # Convert to ndarray once, before drawing (otherwise a frame with no
        # detections would still be a PIL Image when passed to cv2 below).
        image = np.array(image)
        for i in range(boxes.shape[0]):
            # argmax over the per-class scores gives the predicted class id;
            # max gives its confidence.
            class_id = scores[i].argmax()
            label = CLASSES[class_id]
            confidence = scores[i].max()
            text = f"{label} {confidence:.3f}"
            print(text)
            plot_one_box(boxes[i], image, label=text)
        # Convert the annotated RGB image back to BGR before writing with OpenCV.
        frame = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        out.write(frame)
    cap.release()
    out.release()
    print("Done! Total inference time:", t_total)
if __name__ == '__main__':
    parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()])
    args = parser.parse_args()
    process_video(args)
```
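
To run it, save the script (as, say, `inference_video.py`, a name chosen here for illustration) in the DINO repo root and call it with your own paths: `python inference_video.py --weights /path/to/checkpoint_best_regular.pth --input_video /path/to/input.avi --output_dir /path/to/result.mp4`.

One caveat: this feeds raw `ToTensor()` frames to the model, while DINO checkpoints are normally trained on resized, ImageNet-normalized images. If the detections look off, a sketch of preprocessing closer to the repo's own inference notebook, using the repo's `datasets.transforms`, would be:

```python
import datasets.transforms as T  # from the DINO repository

transform = T.Compose([
    T.RandomResize([800], max_size=1333),  # resize the shorter side to 800 px
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # ImageNet stats
])
# These transforms operate on (image, target) pairs; pass None for the target.
image_tensor, _ = transform(image, None)
```

With that resize in place, `rescale_bboxes` should be given the original frame size (`width`, `height`) instead of the resized tensor's shape, so the boxes land correctly on the saved frame.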