DOC 3 — COMPUTER VISION
// DOC 03 / 07

COMPUTER
VISION

Computer vision komprehensif untuk robot: YOLOv8, object tracking ByteTrack, SAM segmentation, depth estimation, stereo vision, point cloud, grasp detection, ArUco markers, pipeline ROS2.

YOLOv8 · SAM · Depth · Stereo 3D · ROS2
01
Overview

PIPELINE CV ROBOT

Computer vision untuk robot mencakup spektrum tugas mulai dari deteksi objek sederhana hingga 3D scene understanding. Pipeline umum:

📷
Capture
Frame dari RGB cam, depth cam, atau stereo rig
🔍
Detect
YOLOv8/RT-DETR untuk bounding box + class
✂️
Segment
SAM/Mask R-CNN untuk pixel-level mask
📐
Depth
Stereo disparity, MiDaS, ZoeDepth
🤖
Act
Grasp point, navigation target, manipulation
🔄
Track
ByteTrack/DeepSORT untuk multi-object tracking

Kamera untuk Robot

KameraTypeResolusiDepthFPSUse Case
OAK-D LiteStereo+RGB4K RGB / 640p depth√ 0.2-35m60Manipulation, nav
Intel RealSense D435Stereo IR1920×1080√ 0.3-3m90Indoor close range
ZED 2Stereo HD4K√ 0.3-20m100Outdoor navigation
Picamera V3Mono RGB12MP✗30Lightweight robot
OAK-1Mono RGB12MP✗ (AI)60AI inference, low latency
02
Hardware

KAMERA SETUP & KALIBRASI

Kalibrasi Kamera (OpenCV)

Camera intrinsic matrix K:
        | fx  0  cx |
    K = |  0  fy cy |
        |  0  0   1 |
Distortion coefficients: [k1, k2, p1, p2, k3]
Normalized coordinates: x = (u - cx)/fx,  y = (v - cy)/fy,  r^2 = x^2 + y^2
Distortion model: x_distorted = x*(1 + k1*r^2 + k2*r^4) + 2*p1*x*y + p2*(r^2 + 2*x^2)
import cv2, numpy as np, glob

def calibrate_camera(images_path, grid=(9, 6), square_mm=25):
    """Calibrate a camera from chessboard photos.

    Args:
        images_path: glob pattern for calibration images (e.g. 'calib/*.jpg').
        grid: inner-corner count of the chessboard (cols, rows).
        square_mm: physical square size in millimeters (sets the scale of tvecs).

    Returns:
        (K, D): 3x3 intrinsic matrix and distortion coefficients [k1,k2,p1,p2,k3].

    Raises:
        ValueError: if no usable chessboard corners were found.
    """
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001)
    # One canonical set of 3D board points (Z=0 plane), reused per image.
    objp = np.zeros((grid[0] * grid[1], 3), np.float32)
    objp[:, :2] = np.mgrid[0:grid[0], 0:grid[1]].T.reshape(-1, 2) * square_mm
    obj_pts, img_pts = [], []
    img_size = None
    for fname in glob.glob(images_path):
        img = cv2.imread(fname)
        if img is None:
            continue  # unreadable file — skip instead of crashing in cvtColor
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img_size = gray.shape[::-1]  # (width, height) as calibrateCamera expects
        ret, corners = cv2.findChessboardCorners(gray, grid)
        if ret:
            obj_pts.append(objp)
            # Sub-pixel corner refinement for better reprojection error.
            img_pts.append(cv2.cornerSubPix(gray, corners, (11, 11), (-1, -1), criteria))
    if not obj_pts:
        # Original crashed with an unbound `gray` when the glob matched nothing.
        raise ValueError(f"No chessboard corners found in images matching {images_path}")
    rms, K, D, rvecs, tvecs = cv2.calibrateCamera(obj_pts, img_pts, img_size, None, None)
    print(f"RMS error: {rms:.4f}")
    return K, D
03
Detection

YOLOv8 OBJECT DETECTION

YOLOv8 Inference + Custom Training

from ultralytics import YOLO
import cv2, numpy as np

# Default COCO-pretrained detector; the suffix picks the size/speed trade-off.
model = YOLO('yolov8n.pt')  # n=nano, s=small, m=medium, l=large, x=xlarge

class ObjectDetector:
    """Thin YOLOv8 wrapper that returns detections as plain dicts."""

    def __init__(self, model_path='yolov8n.pt', conf=0.5, device='cuda'):
        # Load weights first, then move the network onto the requested device.
        self.model = YOLO(model_path)
        self.conf = conf
        self.model.to(device)

    def detect(self, frame):
        """Run inference on one BGR frame.

        Returns a list of dicts with keys: bbox [x1,y1,x2,y2], class (int id),
        label (class name), conf (float score), center ((cx, cy) pixels).
        """
        found = []
        for result in self.model.predict(frame, conf=self.conf, verbose=False):
            if result.boxes is None:
                continue
            for b in result.boxes:
                coords = [int(v) for v in b.xyxy[0]]
                x1, y1, x2, y2 = coords
                class_id = int(b.cls[0])
                found.append({
                    "bbox": coords,
                    "class": class_id,
                    "label": self.model.names[class_id],
                    "conf": float(b.conf[0]),
                    "center": ((x1 + x2) // 2, (y1 + y2) // 2),
                })
        return found

    def detect_and_draw(self, frame):
        """Detect, annotate the frame in place, return (frame, detections)."""
        detections = self.detect(frame)
        for det in detections:
            x1, y1, x2, y2 = det['bbox']
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            caption = f"{det['label']} {det['conf']:.2f}"
            cv2.putText(frame, caption, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
        return frame, detections

Training Custom YOLOv8 (Dataset Sendiri)

# 1. Create dataset.yaml with this layout:
# path: /data/robot_objects
# train: images/train
# val: images/val
# names: {0: gelas, 1: botol, 2: buku, 3: kunci}

# 2. Train — fine-tune from COCO-pretrained nano weights.
model = YOLO('yolov8n.pt')
model.train(
    data='dataset.yaml',
    epochs=100,
    imgsz=640,            # training image size (pixels)
    batch=16,
    device=0,             # GPU index
    augment=True,         # enable default augmentation pipeline
    optimizer='AdamW',
    lr0=0.001,            # initial learning rate
    project='runs/robot')
04
Tracking

MULTI-OBJECT TRACKING

import cv2
import numpy as np

from ultralytics import YOLO

# NOTE(review): this module-level model appears unused — RobotTracker below
# loads its own instance; verify against callers before removing.
model = YOLO('yolov8n.pt')

class RobotTracker:
    """Multi-object tracker: YOLOv8 detections + ByteTrack ID association.

    Keeps a short center-point trail and a stable draw color per track ID.
    """

    TRAIL_LENGTH = 30  # frames of center history kept per track

    def __init__(self):
        self.model = YOLO('yolov8n.pt')
        self.track_history = {}  # track id -> list[(cx, cy)], newest last
        self.colors = {}         # track id -> BGR color tuple for drawing

    def track(self, frame):
        """Track objects in one frame.

        Returns a list of dicts {"id", "cls", "bbox", "center"}; empty list
        when nothing is tracked in this frame.
        """
        results = self.model.track(frame, persist=True, tracker='bytetrack.yaml')
        tracked = []
        boxes = results[0].boxes
        # Guard both cases: no detections at all (boxes is None) and
        # detections without assigned track IDs (boxes.id is None).
        if boxes is None or boxes.id is None:
            return tracked
        xyxy = boxes.xyxy.cpu()
        ids = boxes.id.int().cpu().tolist()
        clss = boxes.cls.int().cpu().tolist()
        for box, tid, cls in zip(xyxy, ids, clss):
            x1, y1, x2, y2 = map(int, box)
            cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
            if tid not in self.track_history:
                self.track_history[tid] = []
                # Requires numpy (added to this snippet's imports) — the
                # original referenced `np` without importing it.
                self.colors[tid] = tuple(int(c) for c in np.random.randint(100, 255, 3))
            self.track_history[tid].append((cx, cy))
            self.track_history[tid] = self.track_history[tid][-self.TRAIL_LENGTH:]
            tracked.append({"id": tid, "cls": cls,
                            "bbox": [x1, y1, x2, y2], "center": (cx, cy)})
        return tracked
05
Segmentation

SEMANTIC SEGMENTATION

from ultralytics import YOLO
import cv2, numpy as np

# YOLOv8 instance segmentation — the '-seg' variant outputs masks alongside boxes.
seg_model = YOLO('yolov8n-seg.pt')

def segment_scene(frame):
    """Instance-segment a frame with YOLOv8-seg.

    Returns (visualization, masks): a 70/30 blend of the frame with colored
    mask overlays, and a list of {"label", "mask"} dicts with masks resized
    to the frame resolution.
    """
    height, width = frame.shape[:2]
    overlay = frame.copy()
    collected = []
    for result in seg_model.predict(frame, conf=0.5, verbose=False):
        if result.masks is None:
            continue
        raw_masks = result.masks.data.cpu().numpy()
        for idx, raw in enumerate(raw_masks):
            class_id = int(result.boxes.cls[idx])
            name = seg_model.names[class_id]
            # Network masks come out at model resolution; match the frame.
            full_mask = cv2.resize(raw, (width, height))
            paint = np.random.randint(100, 255, 3).tolist()
            overlay[full_mask > 0.5] = paint
            collected.append({"label": name, "mask": full_mask})
    blended = cv2.addWeighted(frame, 0.7, overlay, 0.3, 0)
    return blended, collected

SAM — Segment Anything Model

from segment_anything import sam_model_registry, SamPredictor
import torch

# ViT-B is the smallest SAM backbone; checkpoint must be downloaded separately.
sam = sam_model_registry['vit_b'](checkpoint='sam_vit_b.pth')
sam.to('cuda')
# Prompt-based predictor wrapping the SAM model.
predictor = SamPredictor(sam)

def segment_at_point(frame, point_xy):
    """Segment the object under a pixel prompt.

    Args:
        frame: RGB image (HxWx3).
        point_xy: (x, y) pixel coordinate of a foreground click.

    Returns:
        Boolean HxW mask of the highest-scoring SAM proposal.
    """
    # Local import: this snippet's header imports only segment_anything and
    # torch, so `np` was an unresolved name in the original.
    import numpy as np
    predictor.set_image(frame)
    masks, scores, _ = predictor.predict(
        point_coords=np.array([point_xy]),
        point_labels=np.array([1]),  # 1 = foreground prompt
        multimask_output=True)
    return masks[scores.argmax()]  # best of the multiple mask hypotheses
06
Depth

DEPTH ESTIMATION

Depth estimation approaches: 1. Stereo: disparity d → depth Z = f*B/d (B=baseline, f=focal) 2. Structured light: IR pattern, e.g. RealSense 3. Monocular learned: MiDaS, ZoeDepth, DepthAnything → relative depth, perlu scale factor untuk metric depth 4. LiDAR fusion: sparse LiDAR + dense RGB → complete depth
import torch, cv2, numpy as np

class DepthEstimator:
    """Metric monocular depth estimation via ZoeDepth (ZoeD_NK)."""

    def __init__(self):
        # Pulls the model through torch.hub (network access on first run).
        self.model = torch.hub.load('isl-org/ZoeDepth', 'ZoeD_NK', pretrained=True)
        self.model.eval().cuda()

    def estimate(self, frame_rgb):
        """Infer an HxW depth map in meters from an RGB frame."""
        from zoedepth.utils.misc import pil_to_batched_tensor
        from PIL import Image
        batch = pil_to_batched_tensor(Image.fromarray(frame_rgb)).cuda()
        with torch.no_grad():
            prediction = self.model.infer(batch)
        return prediction.squeeze().cpu().numpy()

    def get_object_depth(self, depth_map, bbox):
        """Representative depth (m) inside bbox=(x1, y1, x2, y2).

        Uses the 25th percentile so nearer (foreground object) pixels win
        over background leaking into the box. Returns None when the region
        has no valid depth.
        """
        x1, y1, x2, y2 = bbox
        region = depth_map[y1:y2, x1:x2]
        usable = region[region > 0.1]  # discard near-zero / invalid readings
        if usable.size == 0:
            return None
        return np.percentile(usable, 25)
07
3D Vision

STEREO VISION & DISPARITY

Stereo Geometry: Baseline B = jarak antara dua kamera (meter) Focal length f (pixels) Disparity d = x_left - x_right (pixels) Depth: Z = f * B / d 3D point: X = (x - cx) * Z / fx Y = (y - cy) * Z / fy Minimum measurable depth: Z_min = f*B/d_max Maximum depth limited by noise in disparity
import cv2, numpy as np

class StereoDepth:
    """Rectified stereo depth via semi-global block matching (SGBM).

    Takes both cameras' intrinsics (K_*, D_*) and the right camera's pose
    (R, T) relative to the left, precomputes rectification maps, and uses
    the reprojection matrix Q to lift disparity to 3D.
    """

    def __init__(self, K_L, D_L, K_R, D_R, R, T, img_size):
        rect = cv2.stereoRectify(K_L, D_L, K_R, D_R, img_size, R, T, alpha=0)
        self.R1, self.R2, self.P1, self.P2, self.Q = rect[:5]
        self.map1L, self.map2L = cv2.initUndistortRectifyMap(
            K_L, D_L, self.R1, self.P1, img_size, cv2.CV_32FC1)
        self.map1R, self.map2R = cv2.initUndistortRectifyMap(
            K_R, D_R, self.R2, self.P2, img_size, cv2.CV_32FC1)
        block = 5  # SGBM matching window; P1/P2 smoothness terms scale with it
        self.stereo = cv2.StereoSGBM.create(
            minDisparity=0, numDisparities=128, blockSize=block,
            P1=8 * 3 * block ** 2, P2=32 * 3 * block ** 2,
            disp12MaxDiff=1, uniquenessRatio=10,
            speckleWindowSize=100, speckleRange=32)

    def compute_depth(self, left, right):
        """Return (disparity, xyz) for one raw stereo pair (BGR images)."""
        rect_left = cv2.remap(left, self.map1L, self.map2L, cv2.INTER_LINEAR)
        rect_right = cv2.remap(right, self.map1R, self.map2R, cv2.INTER_LINEAR)
        gray_left = cv2.cvtColor(rect_left, cv2.COLOR_BGR2GRAY)
        gray_right = cv2.cvtColor(rect_right, cv2.COLOR_BGR2GRAY)
        # SGBM outputs fixed-point disparity scaled by 16.
        disparity = self.stereo.compute(gray_left, gray_right).astype(np.float32) / 16
        xyz = cv2.reprojectImageTo3D(disparity, self.Q)
        return disparity, xyz
08
3D

POINT CLOUD PROCESSING

import open3d as o3d
import numpy as np

def depth_to_pointcloud(depth, K, color=None, z_min=0.1, z_max=10.0):
    """Back-project a depth map into an Open3D point cloud.

    Args:
        depth: HxW depth image in meters.
        K: 3x3 camera intrinsic matrix.
        color: optional HxWx3 image (0-255) to color the points.
        z_min, z_max: valid depth band (meters); points outside are dropped.
            (Generalized from the original hard-coded 0.1 / 10 bounds.)

    Returns:
        o3d.geometry.PointCloud with the valid points (and colors if given).
    """
    h, w = depth.shape
    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    # Pinhole back-projection: X = (u-cx)Z/fx, Y = (v-cy)Z/fy.
    Z = depth
    X = (u - cx) * Z / fx
    Y = (v - cy) * Z / fy
    pts = np.stack([X, Y, Z], axis=-1).reshape(-1, 3)
    z_flat = Z.flatten()
    # Bug fix: the original filter read `(Z.flatten()>0.1) & (Z.flatten()10)`
    # — the upper-bound comparison operator was missing (SyntaxError).
    valid = (z_flat > z_min) & (z_flat < z_max)
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(pts[valid])
    if color is not None:
        c = color.reshape(-1, 3) / 255.0  # Open3D expects colors in [0, 1]
        pcd.colors = o3d.utility.Vector3dVector(c[valid])
    return pcd

def process_pointcloud(pcd):
    """Clean a raw cloud and strip the dominant plane.

    Pipeline: voxel downsample -> statistical outlier removal -> normal
    estimation -> RANSAC plane fit. Returns the points NOT on the fitted
    plane (i.e. obstacles, with the floor removed).
    """
    downsampled = pcd.voxel_down_sample(voxel_size=0.02)
    filtered, _ = downsampled.remove_statistical_outlier(nb_neighbors=20, std_ratio=2.0)
    filtered.estimate_normals(
        search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=30))
    # Dominant plane is presumably the floor; keep everything off-plane.
    _, plane_idx = filtered.segment_plane(
        distance_threshold=0.01, ransac_n=3, num_iterations=1000)
    return filtered.select_by_index(plane_idx, invert=True)
09
Manipulation

GRASP POINT DETECTION

6-DOF Grasp Estimation: 1. Point cloud dari depth camera 2. Object segmentation (YOLO mask -> point subset) 3. PCA untuk principal axes 4. Approach vector = normal permukaan terbaik 5. Grasp center = bounding box center Libraries: GraspNet-1Billion, Contact-GraspNet, AnyGrasp
import numpy as np
from sklearn.decomposition import PCA

def estimate_grasp(points_3d):
    """Propose a simple PCA-based grasp for an object point cloud.

    Returns a dict with:
        center       — centroid of the points,
        approach     — unit vector along the thinnest principal axis,
        grip_width_m — gripper opening (smaller of the two widest extents,
                       plus a 10% safety margin),
        axes         — the three principal axes (rows).
    """
    pca = PCA(n_components=3).fit(points_3d)
    principal_axes = pca.components_
    # ~2*sigma along each axis approximates the object extent.
    extents = 2.0 * np.sqrt(pca.explained_variance_)
    thinnest = np.argmin(extents)
    opening = min(extents[0], extents[1]) * 1.1  # 10% safety margin

    return {
        "center": points_3d.mean(axis=0),
        "approach": principal_axes[thinnest],
        "grip_width_m": opening,
        "axes": principal_axes,
    }
10
Fiducials

ARUCO MARKERS

import cv2, cv2.aruco as aruco
import numpy as np

class ArucoDetector:
    """Detect 6x6 ArUco markers and recover each marker's 6-DOF pose."""

    def __init__(self, K, D, marker_size_m=0.1):
        self.K = K
        self.D = D
        self.sz = marker_size_m
        self.dict = aruco.getPredefinedDictionary(aruco.DICT_6X6_250)
        self.params = aruco.DetectorParameters()
        self.detector = aruco.ArucoDetector(self.dict, self.params)
        # Marker corner model in the marker frame (origin at top-left),
        # matching the TL, TR, BR, BL order returned by detectMarkers.
        # Hoisted out of the loop — it is invariant per marker size.
        s = self.sz
        self._obj_pts = np.array(
            [[0, 0, 0], [s, 0, 0], [s, s, 0], [0, s, 0]], np.float32)

    def detect(self, frame):
        """Return a list of {"id", "R", "t"} poses in the camera frame.

        t is in the same units as marker_size_m (meters by default).
        """
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        corners, ids, _ = self.detector.detectMarkers(gray)
        poses = []
        if ids is None:
            return poses
        for i, c in enumerate(corners):
            # Bug fix: cv2.solvePnP returns (retval, rvec, tvec); the original
            # unpacked it as (rvec, tvec, _), so the boolean success flag
            # landed in rvec and the computed pose was garbage.
            ok, rvec, tvec = cv2.solvePnP(
                self._obj_pts, c.reshape(4, 2), self.K, self.D)
            if not ok:
                continue  # pose could not be estimated for this marker
            R, _ = cv2.Rodrigues(rvec)
            poses.append({"id": ids[i][0], "R": R, "t": tvec.flatten()})
        return poses
11
ROS2

ROS2 VISION PIPELINE

import rclpy
from rclpy.node import Node
from sensor_msgs.msg import Image, CameraInfo
from vision_msgs.msg import Detection2DArray, Detection2D
from cv_bridge import CvBridge
from ultralytics import YOLO

class VisionNode(Node):
    """ROS2 node: subscribes to raw camera images, publishes YOLOv8 detections."""

    def __init__(self):
        super().__init__('vision_node')
        self.bridge = CvBridge()
        self.model = YOLO('yolov8n.pt')
        self.det_pub = self.create_publisher(Detection2DArray, '/detections', 10)
        self.create_subscription(Image, '/camera/image_raw', self.img_cb, 10)

    def img_cb(self, msg):
        """Convert the incoming image, run detection, publish a Detection2DArray."""
        frame = self.bridge.imgmsg_to_cv2(msg, desired_encoding='bgr8')
        detections_msg = Detection2DArray()
        detections_msg.header = msg.header  # keep the source timestamp/frame_id
        for result in self.model.predict(frame, verbose=False):
            for box in result.boxes:
                x1, y1, x2, y2 = box.xyxy[0]
                det = Detection2D()
                det.bbox.center.position.x = float((x1 + x2) / 2)
                det.bbox.center.position.y = float((y1 + y2) / 2)
                det.bbox.size_x = float(x2 - x1)
                det.bbox.size_y = float(y2 - y1)
                detections_msg.detections.append(det)
        self.det_pub.publish(detections_msg)

def main():
    """Entry point: init ROS, spin the node, and always clean up on exit.

    The original one-liner spun the node on import with no shutdown path;
    this adds Ctrl-C handling, destroy_node, and rclpy.shutdown.
    """
    rclpy.init()
    node = VisionNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass  # clean Ctrl-C exit
    finally:
        node.destroy_node()
        rclpy.shutdown()

if __name__ == '__main__':
    main()
COMPUTER VISION ROBOT — DOC 03 / 07
Antonius - bluedragonsec.com