There is an object detection algorithm called YOLO (You Only Look Once). It processes the entire image with a single neural network: the network divides the image into regions, then predicts bounding boxes and class probabilities for each region. These bounding boxes are weighted by the predicted probabilities.
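-
To make this concrete, here is a minimal sketch of how one YOLOv3 output vector is decoded; the numbers are invented for illustration, and the 85-element layout (4 box coordinates, 1 objectness score, 80 class scores) matches the COCO-trained model used below.
import numpy as np

detection = np.zeros(85)  # hypothetical output row: [cx, cy, w, h, objectness, 80 class scores]
detection[:5] = [0.5, 0.5, 0.2, 0.3, 0.9]  # box center/size (relative to the image) + objectness
detection[5] = 0.8  # score for class 0 ("person" in coco.names)

class_id = int(np.argmax(detection[5:]))  # highest-scoring class
confidence = detection[5:][class_id]  # the probability weighting for that box
print(class_id, confidence)  # -> 0 0.8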
-
Firstly, we import the essential libraries.
import cv2  # for image processing
import numpy as np  # for numeric calculations
# from google.colab.patches import cv2_imshow  # only needed to display frames in Colab
-
Then, we load the pre-trained YOLOv3 network. Download the weights, configuration, and class-name files from here. We also read the COCO class names and store the network's output layers, since we are only concerned with the final detections.
net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")  # load the pre-trained network
layer_names = net.getLayerNames()
# getUnconnectedOutLayers() returns 1-based indices; flatten() smooths over the
# shape difference between older and newer OpenCV versions
output_layers = [layer_names[i - 1] for i in net.getUnconnectedOutLayers().flatten()]
with open("coco.names", "r") as f:
    classes = [line.strip() for line in f.readlines()]  # the 80 COCO class labels
colors = np.random.uniform(0, 255, size=(len(classes), 3))  # one random color per class
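-
As a quick sanity check (optional), you can print what was loaded; for the standard yolov3.cfg the three output layers are typically named yolo_82, yolo_94, and yolo_106, and coco.names contains 80 classes.
print(len(classes), "classes loaded")   # expect 80 for coco.names
print("output layers:", output_layers)  # expect ['yolo_82', 'yolo_94', 'yolo_106']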
-
Now, provide an input video file. This project's sample video can be downloaded from here.
cap = cv2.VideoCapture("sample.mp4")
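-
Before going further, it is worth checking that the file actually opened and what its native resolution is, since the output writer below expects 640x352 frames (a small sketch using OpenCV's VideoCapture property getters):
if not cap.isOpened():
    raise IOError("Could not open sample.mp4")
in_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
in_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(in_width, in_height)  # should match the 640x352 size used by the writer below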
-
Set the label font for YOLO detection.
font = cv2.FONT_HERSHEY_PLAIN
-
Record the starting time, using the time library; it will be used later to compute the processing FPS.
import time
starting_time = time.time()  # reference point for the FPS calculation
-
Configure the output video writer.
result = cv2.VideoWriter('output.mp4', cv2.VideoWriter_fourcc(*'MJPG'), 25, (640, 352))  # MJPG codec, 25 FPS, 640x352 frames
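-
OpenCV's VideoWriter silently produces an unplayable file if the frame size given here does not match the frames actually written to it, and the MJPG codec usually pairs with an .avi container rather than .mp4. A more defensive variant (a sketch, reusing the cap object opened above) derives the size from the input video:
frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
              int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
result = cv2.VideoWriter('output.avi', cv2.VideoWriter_fourcc(*'MJPG'), 25, frame_size)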
-
The loop now works on the input video, frame by frame.
frame_id = 0
while True:  # process the video frame by frame
    ret, frame = cap.read()  # read the next frame
    if not ret:  # stop when the video ends
        break
    frame_id += 1  # frame counter, used to compute FPS
    height, width, channels = frame.shape  # dimensions of the current frame
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)  # 416x416 resize + channel reorder
    net.setInput(blob)  # feed the blob to the YOLO network
    outs = net.forward(output_layers)  # raw detections from the output layers
    class_ids = []  # detected coco.names class indices
    confidences = []  # detection confidences
    boxes = []  # bounding boxes
    for out in outs:  # object detection
        for detection in out:  # one candidate detection per row
            scores = detection[5:]  # class scores between 0 and 1 for all 80 classes
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.2:  # keep only reasonably confident detections
                center_x = int(detection[0] * width)  # box center, scaled back to pixels
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)  # box width and height
                h = int(detection[3] * height)
                x = int(center_x - w / 2)  # top-left corner of the box
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])  # store the calculated vertices
                confidences.append(float(confidence))
                class_ids.append(class_id)
    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.4, 0.3)  # non-maximum suppression removes overlapping boxes
    for i in range(len(boxes)):  # draw the surviving boxes
        if i in indexes:
            x, y, w, h = boxes[i]  # vertices for the box
            label = str(classes[class_ids[i]])  # detected class label
            confidence = confidences[i]  # detection confidence
            color = colors[class_ids[i]]
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)  # draw the box
            cv2.putText(frame, label + " " + str(round(confidence, 2)), (x, y + 30), font, 2, color, 2)  # label + confidence
    elapsed_time = time.time() - starting_time  # time passed since the start
    fps = frame_id / elapsed_time  # processing speed in frames per second
    cv2.putText(frame, "FPS: " + str(round(fps, 2)), (10, 50), font, 2, (0, 0, 0), 3)  # overlay the FPS counter
    result.write(frame)  # write the frame into the output video
    # cv2_imshow(frame)  # uncomment to preview frames in Colab
    if cv2.waitKey(1) == ord('q'):  # press 'q' to stop early
        break
cap.release()
result.release()  # finalize the output file
cv2.destroyAllWindows()
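-
For reference, the scale factor 0.00392 passed to blobFromImage above is simply 1/255, which maps 8-bit pixel values into the [0, 1] range the network was trained on, and the two NMSBoxes thresholds are the minimum score (0.4) and the overlap ratio above which boxes are suppressed (0.3). Writing the scale explicitly (a sketch of the equivalent call) makes the intent clearer:
blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), (0, 0, 0), swapRB=True, crop=False)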