Object Detection Overview
Object detection combines classification and localization - identifying what objects are in an image and where they're located. It's essential for autonomous vehicles, surveillance, retail analytics, and more.
YOLO and Detectron2 are the two most popular frameworks for production object detection.
YOLO: You Only Look Once
YOLO revolutionized object detection by treating it as a single regression problem, enabling real-time detection:
- Speed: Can process 30+ FPS on modern GPUs
- End-to-end: Single neural network predicts boxes and classes
- Versions: YOLOv5, YOLOv7, YOLOv8 (Ultralytics)
Getting Started with YOLOv8
# Install Ultralytics (run in a shell, not in Python):
#   pip install ultralytics
from ultralytics import YOLO

# Load a pretrained model. The nano variant is the smallest and fastest;
# available sizes: yolov8n, yolov8s, yolov8m, yolov8l, yolov8x.
model = YOLO('yolov8n.pt')

# Run inference on a single image.
results = model('image.jpg')

# Display the annotated result.
results[0].show()

# Iterate over predictions and print class name, confidence, and box.
for result in results:
    boxes = result.boxes  # bounding boxes for this image
    for box in boxes:
        cls = int(box.cls[0])        # class index
        conf = float(box.conf[0])    # confidence score
        xyxy = box.xyxy[0].tolist()  # [x1, y1, x2, y2] pixel coordinates
        class_name = model.names[cls]
        print(f"{class_name}: {conf:.2f} at {xyxy}")

# Save the annotated image to disk.
results[0].save('output.jpg')
# --- Video Detection with YOLO ---
from ultralytics import YOLO
import cv2

model = YOLO('yolov8n.pt')

# Process a video file frame by frame. stream=True yields results lazily
# so the whole video is never held in memory at once.
results = model('video.mp4', stream=True)
for result in results:
    frame = result.plot()  # draw boxes on the frame
    cv2.imshow('Detection', frame)
    if cv2.waitKey(1) == ord('q'):  # quit on 'q'
        break
cv2.destroyAllWindows()

# Or process a live webcam stream (device 0).
cap = cv2.VideoCapture(0)
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:  # end of stream or read failure
        break
    results = model(frame)
    annotated = results[0].plot()
    cv2.imshow('Webcam Detection', annotated)
    if cv2.waitKey(1) == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()
# --- Training a Custom YOLO Model ---
# Expected dataset layout:
#   dataset/
#     train/
#       images/
#       labels/
#     val/
#       images/
#       labels/
#     data.yaml
#
# data.yaml contents:
"""
path: ./dataset
train: train/images
val: val/images
names:
  0: cat
  1: dog
  2: bird
"""
from ultralytics import YOLO

# Start from a pretrained base model (transfer learning).
model = YOLO('yolov8n.pt')

# Fine-tune on the custom dataset.
results = model.train(
    data='data.yaml',        # dataset description file above
    epochs=100,
    imgsz=640,               # training image size
    batch=16,
    name='custom_detector'   # run name for output directory
)

# Evaluate on the validation split.
metrics = model.val()
print(f"mAP50: {metrics.box.map50}")
print(f"mAP50-95: {metrics.box.map}")

# Export for deployment ('tflite' and 'torchscript' are also supported).
model.export(format='onnx')
# --- YOLO Label Format ---
# YOLO uses normalized coordinates. Each line in a label file is:
#   class_id x_center y_center width height
# Example label.txt:
#   0 0.5 0.5 0.3 0.4
#   1 0.2 0.3 0.1 0.15

def convert_bbox_to_yolo(img_width, img_height, x_min, y_min, x_max, y_max):
    """Convert a pixel-space corner box to YOLO's normalized center format.

    Args:
        img_width: Image width in pixels (must be > 0).
        img_height: Image height in pixels (must be > 0).
        x_min, y_min, x_max, y_max: Box corners in pixels.

    Returns:
        Tuple (x_center, y_center, width, height), each normalized by the
        image size (in [0, 1] for boxes that lie within the image).

    Raises:
        ValueError: If either image dimension is not positive.
    """
    if img_width <= 0 or img_height <= 0:
        raise ValueError("image dimensions must be positive")
    x_center = (x_min + x_max) / 2 / img_width
    y_center = (y_min + y_max) / 2 / img_height
    width = (x_max - x_min) / img_width
    height = (y_max - y_min) / img_height
    return x_center, y_center, width, height
# Labelme to YOLO conversion
import json
import os

def labelme_to_yolo(json_file, class_names):
    """Convert a Labelme annotation file to YOLO-format label lines.

    Args:
        json_file: Path to a Labelme JSON file (must contain 'imageWidth',
            'imageHeight', and 'shapes' with 'label' and 'points').
        class_names: Ordered list of class names; a shape's class id is its
            label's index in this list.

    Returns:
        All labels joined into a single newline-separated string.

    Raises:
        ValueError: If a shape label is not present in class_names.
    """
    with open(json_file) as f:
        data = json.load(f)

    img_width = data['imageWidth']
    img_height = data['imageHeight']

    lines = []
    for shape in data['shapes']:
        class_id = class_names.index(shape['label'])
        # Axis-aligned bounding box of the polygon's points.
        xs = [p[0] for p in shape['points']]
        ys = [p[1] for p in shape['points']]
        yolo_bbox = convert_bbox_to_yolo(
            img_width, img_height, min(xs), min(ys), max(xs), max(ys)
        )
        lines.append(f"{class_id} {' '.join(map(str, yolo_bbox))}")
    return '\n'.join(lines)
# --- Detectron2: Facebook's Detection Framework ---
# Install Detectron2 (run in a shell, not in Python):
#   pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu118/torch2.0/index.html
import detectron2
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog
import cv2

# Build a config from the model zoo's Faster R-CNN baseline.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(
    "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # drop detections below 50% confidence
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
    "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
)

# Create the predictor and run inference on a BGR image (OpenCV channel order).
predictor = DefaultPredictor(cfg)
image = cv2.imread("image.jpg")
outputs = predictor(image)

# Visualizer expects RGB, so reverse the channel axis for drawing and again
# for display with cv2.imshow (which expects BGR).
v = Visualizer(image[:, :, ::-1], MetadataCatalog.get(cfg.DATASETS.TRAIN[0]))
out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
cv2.imshow("Detection", out.get_image()[:, :, ::-1])
cv2.waitKey(0)
Instance Segmentation with Detectron2
# Same workflow as detection, but with a Mask R-CNN config/weights pair.
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
# Mask R-CNN for instance segmentation
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(
"COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"
))
# Discard predictions below 50% confidence.
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
"COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"
)
predictor = DefaultPredictor(cfg)
# NOTE: `image` is the BGR image loaded in the previous section.
outputs = predictor(image)
# Access masks
instances = outputs["instances"]
masks = instances.pred_masks # Boolean masks for each instance
boxes = instances.pred_boxes # Detected bounding boxes
classes = instances.pred_classes # Predicted class ids
scores = instances.scores # Confidence per instance
# --- Training a Custom Detectron2 Model ---
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg
from detectron2 import model_zoo
import os

def get_custom_dicts(img_dir):
    """Load annotations for *img_dir* in Detectron2's dataset-dict format.

    Each returned dict needs the keys: file_name, height, width, image_id,
    annotations. Each annotation needs: bbox, bbox_mode, category_id (and
    optionally segmentation for mask training).
    """
    # TODO: implement for your own dataset.
    pass

# Register train/val splits under the names the config refers to.
DatasetCatalog.register("custom_train", lambda: get_custom_dicts("train/"))
DatasetCatalog.register("custom_val", lambda: get_custom_dicts("val/"))
MetadataCatalog.get("custom_train").set(thing_classes=["class1", "class2"])

# Configure training starting from the COCO Faster R-CNN baseline.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(
    "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
))
cfg.DATASETS.TRAIN = ("custom_train",)
cfg.DATASETS.TEST = ("custom_val",)
cfg.DATALOADER.NUM_WORKERS = 4
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
    "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
)
cfg.SOLVER.IMS_PER_BATCH = 4
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.MAX_ITER = 3000
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2  # your number of classes

os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)  # start from the zoo weights, not a checkpoint
trainer.train()
YOLO vs Detectron2
- YOLO: Faster inference, easier to use, better for real-time
- Detectron2: More accurate, better for research, more flexible
- Use YOLO: When speed matters (edge devices, real-time apps)
- Use Detectron2: When accuracy matters (medical imaging, research)
# --- Deployment Tips ---
# Export a trained YOLO model to ONNX for framework-independent deployment.
from ultralytics import YOLO

model = YOLO('best.pt')
model.export(format='onnx', dynamic=True, simplify=True)

# Run the exported model with ONNX Runtime.
import onnxruntime as ort
import numpy as np
import cv2

session = ort.InferenceSession('best.onnx')

def preprocess(image):
    """Resize to the network input size and return an NCHW float32 batch in [0, 1]."""
    img = cv2.resize(image, (640, 640))
    img = img.transpose(2, 0, 1)  # HWC -> CHW
    img = img.astype(np.float32) / 255.0
    return np.expand_dims(img, axis=0)  # add batch dimension

def detect(image):
    """Run the ONNX session on one image and return the raw model outputs."""
    input_tensor = preprocess(image)
    outputs = session.run(None, {'images': input_tensor})
    return outputs

# TensorRT for maximum GPU speed:
#   model.export(format='engine')  # creates a TensorRT engine
Build Object Detection Systems
Our Data Science program covers computer vision and object detection with hands-on projects. Learn to build and deploy real-time detection systems.
Explore Data Science Program