rohitgeo/singleshotdetector

Inference on onnx/mlmodel

Opened this issue · 1 comments

stev3 commented

Has anyone succeeded in porting this for inference on mobile? I've converted it, but cannot figure out how to transform the bounding box and classification outputs into x,y,w,h.

I thought I might be able to grab the highest confidence in the class I'm looking for and use that row index to find the corresponding bounding box, but the bounding box is not in the typical range (between 0 and 1).

# Index of the highest-scoring row along axis 0 of the first 'confidence' entry.
# NOTE(review): presumably rows are candidate boxes and columns are classes,
# so ans[k] would be the best box for class k — confirm against the model output.
ans = preds['confidence'][0].argmax(axis=0)
# Raw 'coordinates' row for the box selected for class index 3.
coords = preds['coordinates'][0][ans[3]] # ans[3] corresponds to the class I'm looking for
# Softmax over the four coordinate values; the result is never used below
# (the print shows the raw `coords`, not `coords_softmax`).
coords_softmax = softmax(coords, axis=0)
# Prints the raw values, which fall outside the 0..1 range (see output below).
print(coords)
[  0.35327148   2.9023438    1.5244141  -10.9296875 ]

Below is my conversion code.

# Spatial size shared by the ImageScale constant tensors and the dummy export
# input below. NOTE(review): presumably (height, width) — confirm.
size = (224,224)

class ImageScale(nn.Module):
    """Scale a raw 0-255 image to normalized floats and add a batch axis.

    ``forward`` divides the input by 255, subtracts a per-channel mean,
    divides by a per-channel standard deviation and returns the result with
    a new leading dimension of size 1.

    The constants are materialized as full-size tensors and kept as plain
    attributes (not parameters or buffers), exactly as before, so the
    module's ``state_dict`` stays empty.

    Args:
        image_size: Optional two-item sequence giving the last two
            dimensions of the input. Defaults to the module-level ``size``.
        device: Optional device for the constant tensors. Defaults to CUDA
            when it is available and to CPU otherwise, so the module can
            also be built on machines without a GPU.
    """

    def __init__(self, image_size=None, device=None):
        super().__init__()
        if image_size is None:
            image_size = size
        if device is None:
            # The original hard-coded "cuda", which raises without a GPU.
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        plane_shape = (1, image_size[0], image_size[1])

        def plane(value):
            # One constant-valued channel plane on the chosen device.
            return torch.full(plane_shape, value, device=device)

        # Channel order is R, G, B; the values match the commonly used
        # ImageNet normalization statistics (assumption — confirm for your model).
        channel_means = (0.485, 0.456, 0.406)
        channel_stds = (0.229, 0.224, 0.225)

        self.denominator = plane(255.)
        self.means = torch.cat([plane(value) for value in channel_means], 0)
        self.stds = torch.cat([plane(value) for value in channel_stds], 0)

    def forward(self, x):
        """Return ``((x / 255) - means) / stds`` with a leading batch axis.

        Args:
            x: Tensor broadcastable against the ``(3, H, W)`` constants and
                located on the same device as them.
        """
        normalized = torch.div(x, self.denominator)
        numerator = torch.sub(normalized, self.means)
        out = torch.div(numerator, self.stds)
        return out.unsqueeze(0)
# Chain the preprocessing layer in front of the trained SSD network so the
# exported graph accepts a raw image tensor.
final_model = nn.Sequential(ImageScale(), ssd.learn.model)

# Export to ONNX by tracing with a random CUDA input shaped (3, size[0], size[1]).
model_name = "ssd_resnet_5_epochs.onnx"
dummy_input = Variable(torch.randn(3, size[0], size[1])).cuda()
torch.onnx.export(
    final_model,
    dummy_input,
    model_name,
    input_names=['image'],
    output_names=['confidence', 'coordinates'],
)

# Reload the ONNX file, convert it to Core ML, label the input and outputs,
# then write the .mlmodel file.
onnx_model = onnx.load(model_name)
mlmodel = convert(
    onnx_model,
    image_input_names=['image'],
    target_ios='13',
)
mlmodel.input_description['image'] = 'Image'
mlmodel.output_description['coordinates'] = 'Coordinates'
mlmodel.output_description['confidence'] = 'confidence'
mlmodel.save('ssd_resnet_5_epochs.mlmodel')
stev3 commented

I figured it out.

For anyone looking to run inference on iOS, you must create your anchor and grid sizes and pass them along with your bounding boxes into the _actn_to_bb function. Then deregularize and create bounding boxes like this:

# Decode the raw box activations into boxes using the anchors and grid sizes.
coords = torch.tensor(coords)
coords_actn_to_bb = _actn_to_bb(coords, anchors, grid_sizes)

# Draw one rectangle per class of interest: ans[2] / ans[3] is the box with
# the max value of the respective class, each with its own color.
for box_index, color in ((ans[2], [0, 255, 0]), (ans[3], [255, 0, 0])):
    coords_ans = coords_actn_to_bb[box_index]
    # NOTE(review): assumes _actn_to_bb returns values in 0..1, which this
    # rescales to -1..1 — confirm against _actn_to_bb.
    coords_ans_bb = (coords_ans - 0.5) * 2

    # Entries 0..3 are read as y1, x1, y2, x2. 112 is presumably half of the
    # 224-pixel model input, so -1..1 maps to 0..224 pixels.
    # cv2.rectangle expects integer pixel coordinates, so round and convert.
    y1, x1, y2, x2 = (
        int(round(float(112 * (1 + coords_ans_bb[i])))) for i in range(4)
    )

    cv2.rectangle(frame, (x1, y1), (x2, y2), color, thickness=2)