In the previous tutorial series, we explored how to run AI object detection models on the UNIHIKER using the official code. We also conducted basic tests and comparisons on common models.
In the article YOLO and MediaPipe Image Object Detection Testing on UNIHIKER, we evaluated the performance of these models in detecting objects in images.
In this article, we will test video (camera) object detection on the UNIHIKER, comparing YOLO models exported to ONNX with different export options, as well as MediaPipe models at different resolutions with and without quantization.
Note: Because video object detection and image object detection use the same models, detection accuracy is identical between the two; for accuracy results, refer to the earlier article, YOLO and MediaPipe Image Object Detection Testing on UNIHIKER.
The official code for exporting an ONNX model is:
from ultralytics import YOLO

# Load a pretrained YOLOv10n model
model = YOLO("yolov10n.pt")

# Export the model to ONNX format
model.export(format='onnx')
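After this runs, the exported model (yolov10n.onnx by default) is saved alongside the original .pt weights; this is the file the detection script later in this article loads.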
When using the official code to export a YOLO-series model to ONNX format, several export options can be combined, including imgsz (input image size), half (FP16 precision), dynamic (dynamic input shapes), and simplify (ONNX graph simplification).
We conducted comparative tests on different combinations of these options, all at a resolution of 640. The following models were included:
Each exported model was tested for video object detection on a UNIHIKER board with a USB camera. The results are as follows:
From the test statistics, we can draw the following conclusions:
1. If you know the input image size in advance, do not set any extra export options; just use imgsz to specify the size.
2. If the input image size is not known in advance, use dynamic=True.
3. Do not set half=True or simplify=True (see the export sketch below).
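To make these recommendations concrete, here is a minimal sketch of the corresponding export calls, assuming the same yolov10n.pt checkpoint as above (the imgsz value of 320 is just an example):

from ultralytics import YOLO

model = YOLO("yolov10n.pt")

# Case 1: the input size is known in advance -- only set imgsz
model.export(format="onnx", imgsz=320)

# Case 2: the input size is not known in advance -- enable dynamic shapes
model.export(format="onnx", dynamic=True)

# Based on the test results above, avoid half=True and simplify=True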
Download here: https://drive.google.com/file/d/1NLO-UYVeE9E0z6mXimCxWkDdNOcTCXzc/view?usp=sharing
As the input size decreases, the frame rate increases significantly. The frame rate of yolov10n is about 10% higher than that of yolov8n, while its model size is about 25% smaller.
For the three MediaPipe models, we tested several resolutions both without quantization and with int8 quantization. The resolutions tested were: 640, 448, 320, 256, and 128.
Statistical analysis of the test results revealed the following characteristics:
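Incidentally, switching between the non-quantized and int8-quantized MediaPipe models only changes which .tflite file is passed to model_asset_path. A minimal sketch, assuming hypothetical local file names for the two variants (the int8 file name is a placeholder, not an official download name):

from mediapipe.tasks import python
from mediapipe.tasks.python import vision

def make_detector(model_path, score_threshold=0.5):
    # Build a MediaPipe ObjectDetector for the given .tflite model file
    base_options = python.BaseOptions(model_asset_path=model_path)
    options = vision.ObjectDetectorOptions(base_options=base_options,
                                           score_threshold=score_threshold)
    return vision.ObjectDetector.create_from_options(options)

detector_fp32 = make_detector('efficientdet_lite0.tflite')       # non-quantized
detector_int8 = make_detector('efficientdet_lite0_int8.tflite')  # int8 (placeholder file name)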
The code for video object detection with yolov10n on the UNIHIKER with a USB camera is as follows:
import cv2
import numpy as np
import onnxruntime as ort
import yaml
import time


def preprocess(frame, input_size):
    """Resize the frame and convert it to a normalized NCHW float32 tensor."""
    image = cv2.resize(frame, input_size, interpolation=cv2.INTER_NEAREST)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # the model expects RGB input
    image_data = np.array(image).transpose(2, 0, 1)  # HWC -> CHW
    image_data = image_data.astype(np.float32)
    image_data /= 255.0  # scale pixel values to [0, 1]
    image_data = np.expand_dims(image_data, axis=0)  # add the batch dimension
    return image_data


def postprocess(output, image, input_size, show_size, classes):
    """Scale detections from model coordinates to display coordinates and draw them."""
    for detection in output:
        x1, y1, x2, y2, conf, class_id = detection
        if conf > 0.4:
            # Map box coordinates from the model input size to the display size
            x1 = int(x1 / input_size[0] * show_size[0])
            x2 = int(x2 / input_size[0] * show_size[0])
            y1 = int(y1 / input_size[1] * show_size[1])
            y2 = int(y2 / input_size[1] * show_size[1])
            class_id = int(class_id)
            cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)
            class_name = classes[class_id]
            cv2.putText(image, class_name, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
    return image


def main():
    input_size = (128, 128)
    show_size = (320, 240)

    # Load the COCO class names from the Ultralytics dataset config
    with open('ultralytics/cfg/datasets/coco.yaml', 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    classes = data['names']

    window_name = 'FullScreen Image'
    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
    cv2.setWindowProperty(window_name, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

    session = ort.InferenceSession('yolov10n.onnx')
    input_name = session.get_inputs()[0].name

    cap = cv2.VideoCapture(0)
    # Capture at the display resolution so the drawn boxes line up with the frame
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, show_size[0])
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, show_size[1])
    if not cap.isOpened():
        print("Cannot open camera")
        exit()

    prev_time = time.time()
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break
        current_time = time.time()
        input_tensor = preprocess(frame, input_size)
        outputs = session.run(None, {input_name: input_tensor})
        # YOLOv10 ONNX output: (num_detections, 6) -> x1, y1, x2, y2, conf, class_id
        output = outputs[0][0]
        show_image = postprocess(output, frame, input_size, show_size, classes)
        fps = 1.0 / (current_time - prev_time)
        prev_time = current_time
        cv2.putText(show_image, f"FPS: {fps:.2f}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
        cv2.imshow(window_name, show_image)
        if cv2.waitKey(1) == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()


if __name__ == '__main__':
    main()
The code for video object detection with MediaPipe on the UNIHIKER with a USB camera is as follows:
import numpy as np
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import cv2
import time

input_size = (640, 640)

# STEP 1: Create an ObjectDetector object.
base_options = python.BaseOptions(model_asset_path='efficientdet_lite0.tflite')
options = vision.ObjectDetectorOptions(base_options=base_options,
                                       score_threshold=0.5)
detector = vision.ObjectDetector.create_from_options(options)

# Drawing parameters
MARGIN = 10  # pixels
ROW_SIZE = 10  # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
TEXT_COLOR = (255, 0, 0)  # red


def visualize(image, detection_result) -> np.ndarray:
    """Draws bounding boxes on the input image and returns it.

    Args:
        image: The input image (BGR, as read by OpenCV).
        detection_result: The list of all "Detection" entities to be visualized.

    Returns:
        Image with bounding boxes.
    """
    for detection in detection_result.detections:
        # Draw the bounding box
        bbox = detection.bounding_box
        start_point = bbox.origin_x, bbox.origin_y
        end_point = bbox.origin_x + bbox.width, bbox.origin_y + bbox.height
        cv2.rectangle(image, start_point, end_point, TEXT_COLOR, 3)

        # Draw the label and score
        category = detection.categories[0]
        category_name = category.category_name
        probability = round(category.score, 2)
        result_text = category_name + ' (' + str(probability) + ')'
        text_location = (MARGIN + bbox.origin_x,
                         MARGIN + ROW_SIZE + bbox.origin_y)
        cv2.putText(image, result_text, text_location, cv2.FONT_HERSHEY_PLAIN,
                    FONT_SIZE, TEXT_COLOR, FONT_THICKNESS)
    return image


# STEP 2: Initialize the video capture from the webcam.
cap = cv2.VideoCapture(0)
prev_time = time.time()
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    frame = cv2.resize(frame, input_size, interpolation=cv2.INTER_NEAREST)

    # STEP 3: Convert the frame to the format required by MediaPipe (RGB).
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)

    # STEP 4: Detect objects in the frame.
    detection_result = detector.detect(image)

    # STEP 5: Process the detection result. In this case, visualize it.
    annotated_frame = visualize(frame, detection_result)

    # Calculate and display the frame rate
    current_time = time.time()
    fps = 1 / (current_time - prev_time)
    prev_time = current_time
    fps_text = f'FPS: {fps:.2f}'
    print(fps_text)
    cv2.putText(annotated_frame, fps_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    cv2.imshow('Object Detection', annotated_frame)

    # Break the loop if the user presses 'q'.
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the resources.
cap.release()
cv2.destroyAllWindows()
If you need any help or want to join more discussions, feel free to join our Discord: https://discord.gg/PVAWBMPwsk