// DOC 03 / 07
COMPUTER
VISION
Computer vision komprehensif untuk robot: YOLOv8, object tracking ByteTrack, SAM segmentation, depth estimation, stereo vision, point cloud, grasp detection, ArUco markers, pipeline ROS2.
01
Overview
PIPELINE CV ROBOT
Computer vision untuk robot mencakup spektrum tugas mulai dari deteksi objek sederhana hingga 3D scene understanding. Pipeline umum:
📷
Capture
Frame dari RGB cam, depth cam, atau stereo rig
🔍
Detect
YOLOv8/RT-DETR untuk bounding box + class
✂️
Segment
SAM/Mask R-CNN untuk pixel-level mask
📐
Depth
Stereo disparity, MiDaS, ZoeDepth
🤖
Act
Grasp point, navigation target, manipulation
🔄
Track
ByteTrack/DeepSORT untuk multi-object tracking
Kamera untuk Robot
| Kamera | Type | Resolusi | Depth | FPS | Use Case |
|---|---|---|---|---|---|
| OAK-D Lite | Stereo+RGB | 4K RGB / 640p depth | ✓ 0.2-35m | 60 | Manipulation, nav |
| Intel RealSense D435 | Stereo IR | 1920×1080 | ✓ 0.3-3m | 90 | Indoor close range |
| ZED 2 | Stereo HD | 4K | ✓ 0.3-20m | 100 | Outdoor navigation |
| Picamera V3 | Mono RGB | 12MP | ✗ | 30 | Lightweight robot |
| OAK-1 | Mono RGB | 12MP | ✗ (AI) | 60 | AI inference, low latency |
02
Hardware
KAMERA SETUP & KALIBRASI
Kalibrasi Kamera (OpenCV)
Camera Intrinsic Matrix K:
| fx 0 cx |
K = | 0 fy cy |
| 0 0 1 |
Distortion: [k1, k2, p1, p2, k3]
Undistort (normalized coordinates):
x = (u - cx)/fx,  y = (v - cy)/fy
r^2 = x^2 + y^2
x_distorted = x*(1 + k1*r^2 + k2*r^4 + k3*r^6) + 2*p1*x*y + p2*(r^2 + 2*x^2)
import cv2, numpy as np, glob

def calibrate_camera(images_path, grid=(9, 6), square_mm=25):
    """Calibrate a camera from chessboard images.

    Args:
        images_path: glob pattern for calibration images (e.g. 'calib/*.jpg').
        grid: inner-corner count of the chessboard (cols, rows).
        square_mm: physical square size in millimeters (sets the scale of objp).

    Returns:
        (K, D): 3x3 intrinsic matrix and distortion coefficients [k1,k2,p1,p2,k3].

    Raises:
        ValueError: if no chessboard corners were found in any image.
    """
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001)
    # 3D object points of the board corners in its own plane (Z = 0)
    objp = np.zeros((grid[0] * grid[1], 3), np.float32)
    objp[:, :2] = np.mgrid[0:grid[0], 0:grid[1]].T.reshape(-1, 2) * square_mm
    obj_pts, img_pts = [], []
    img_size = None
    for fname in glob.glob(images_path):
        img = cv2.imread(fname)
        if img is None:  # unreadable file — skip instead of crashing in cvtColor
            continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img_size = gray.shape[::-1]  # (width, height) as calibrateCamera expects
        ret, corners = cv2.findChessboardCorners(gray, grid)
        if ret:
            obj_pts.append(objp)
            # refine corner locations to sub-pixel accuracy
            img_pts.append(cv2.cornerSubPix(gray, corners, (11, 11), (-1, -1), criteria))
    if not obj_pts:
        # BUG FIX: original crashed on undefined `gray` when nothing matched
        raise ValueError(f"no chessboard corners found in {images_path!r}")
    rms, K, D, rvecs, tvecs = cv2.calibrateCamera(obj_pts, img_pts, img_size, None, None)
    print(f"RMS error: {rms:.4f}")  # reprojection error in pixels; < 0.5 is good
    return K, D
03
Detection
YOLOv8 OBJECT DETECTION
YOLOv8 Inference + Custom Training
from ultralytics import YOLO
import cv2, numpy as np

# Model sizes: n=nano, s=small, m=medium, l=large, x=xlarge

class ObjectDetector:
    """YOLOv8 detector wrapper returning per-object dicts (bbox/class/conf/center).

    Note: the original listing also loaded a module-level model that was never
    used; removed — the class owns its own model instance.
    """
    def __init__(self, model_path='yolov8n.pt', conf=0.5, device='cuda'):
        # conf: minimum confidence threshold applied at predict time
        self.model = YOLO(model_path)
        self.conf = conf
        self.model.to(device)

    def detect(self, frame):
        """Run inference on one BGR frame; returns a list of detection dicts."""
        results = self.model.predict(frame, conf=self.conf, verbose=False)
        detections = []
        for r in results:
            boxes = r.boxes
            if boxes is None:
                continue
            for box in boxes:
                x1, y1, x2, y2 = [int(v) for v in box.xyxy[0]]
                cls = int(box.cls[0])
                conf = float(box.conf[0])
                cx, cy = (x1 + x2) // 2, (y1 + y2) // 2  # bbox center, pixel coords
                detections.append({
                    "bbox": [x1, y1, x2, y2],
                    "class": cls,
                    "label": self.model.names[cls],
                    "conf": conf,
                    "center": (cx, cy)})
        return detections

    def detect_and_draw(self, frame):
        """Detect and draw boxes + labels onto the frame (modified in place)."""
        dets = self.detect(frame)
        for d in dets:
            x1, y1, x2, y2 = d['bbox']
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f"{d['label']} {d['conf']:.2f}", (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
        return frame, dets
Training Custom YOLOv8 (Dataset Sendiri)
# 1. Buat dataset.yaml # path: /data/robot_objects # train: images/train # val: images/val # names: {0: gelas, 1: botol, 2: buku, 3: kunci} # 2. Train model = YOLO('yolov8n.pt') model.train( data='dataset.yaml', epochs=100, imgsz=640, batch=16, device=0, augment=True, optimizer='AdamW', lr0=0.001, project='runs/robot')
04
Tracking
MULTI-OBJECT TRACKING
from ultralytics import YOLO
import cv2
import numpy as np  # BUG FIX: np is used below but was never imported

class RobotTracker:
    """YOLOv8 + ByteTrack multi-object tracker keeping a short trail per track ID."""
    def __init__(self):
        self.model = YOLO('yolov8n.pt')
        self.track_history = {}  # track id -> list of recent (cx, cy) centers
        self.colors = {}         # track id -> BGR color assigned on first sight

    def track(self, frame):
        """Track objects across frames; persist=True keeps IDs between calls.

        Returns a list of dicts: {"id", "cls", "bbox", "center"}.
        """
        results = self.model.track(frame, persist=True, tracker='bytetrack.yaml')
        tracked = []
        if results[0].boxes.id is not None:  # id is None when nothing is tracked
            boxes = results[0].boxes.xyxy.cpu()
            ids = results[0].boxes.id.int().cpu().tolist()
            clss = results[0].boxes.cls.int().cpu().tolist()
            for box, tid, cls in zip(boxes, ids, clss):
                x1, y1, x2, y2 = map(int, box)
                cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
                if tid not in self.track_history:
                    self.track_history[tid] = []
                    self.colors[tid] = tuple(int(c) for c in np.random.randint(100, 255, 3))
                self.track_history[tid].append((cx, cy))
                self.track_history[tid] = self.track_history[tid][-30:]  # 30-frame trail
                tracked.append({"id": tid, "cls": cls,
                                "bbox": [x1, y1, x2, y2], "center": (cx, cy)})
        return tracked
05
Segmentation
SEMANTIC SEGMENTATION
from ultralytics import YOLO
import cv2, numpy as np

# YOLOv8 instance segmentation model (nano)
seg_model = YOLO('yolov8n-seg.pt')

def segment_scene(frame):
    """Run instance segmentation and blend colored masks over the frame.

    Returns:
        (blended_frame, masks_out) where masks_out is a list of
        {"label": class name, "mask": float mask resized to frame size}.
    """
    results = seg_model.predict(frame, conf=0.5, verbose=False)
    masks_out = []
    overlay = frame.copy()
    frame_h, frame_w = frame.shape[:2]
    for result in results:
        if result.masks is None:
            continue
        raw_masks = result.masks.data.cpu().numpy()
        det_boxes = result.boxes
        for idx, raw in enumerate(raw_masks):
            class_id = int(det_boxes.cls[idx])
            name = seg_model.names[class_id]
            # model masks come out at network resolution — scale up to the frame
            full_mask = cv2.resize(raw, (frame_w, frame_h))
            paint = np.random.randint(100, 255, 3).tolist()
            overlay[full_mask > 0.5] = paint
            masks_out.append({"label": name, "mask": full_mask})
    # 70% original + 30% colored overlay
    return cv2.addWeighted(frame, 0.7, overlay, 0.3, 0), masks_out
SAM — Segment Anything Model
from segment_anything import sam_model_registry, SamPredictor
import torch
import numpy as np  # BUG FIX: np is used below but was never imported

sam = sam_model_registry['vit_b'](checkpoint='sam_vit_b.pth')
sam.to('cuda')
predictor = SamPredictor(sam)

def segment_at_point(frame, point_xy):
    """Segment the object under a clicked pixel with SAM.

    Args:
        frame: RGB image (HxWx3).
        point_xy: (x, y) pixel coordinate inside the target object.

    Returns:
        Boolean HxW mask of the highest-scoring candidate.
    """
    predictor.set_image(frame)
    masks, scores, _ = predictor.predict(
        point_coords=np.array([point_xy]),
        point_labels=np.array([1]),    # 1 = foreground prompt
        multimask_output=True)         # produce multiple candidates
    return masks[scores.argmax()]      # keep the best-scoring mask
06
Depth
DEPTH ESTIMATION
Depth estimation approaches:
1. Stereo: disparity d → depth Z = f*B/d (B=baseline, f=focal)
2. Structured light: IR pattern, e.g. RealSense
3. Monocular learned: MiDaS, ZoeDepth, DepthAnything
→ relative depth, perlu scale factor untuk metric depth
4. LiDAR fusion: sparse LiDAR + dense RGB → complete depth
import torch, numpy as np

class DepthEstimator:
    """ZoeDepth metric monocular depth estimation (depth in meters)."""
    def __init__(self):
        # Downloads weights on first use; NK = indoor+outdoor mixed model
        self.model = torch.hub.load('isl-org/ZoeDepth', 'ZoeD_NK', pretrained=True)
        self.model.eval().cuda()

    def estimate(self, frame_rgb):
        """Return an HxW depth map in meters for one RGB frame."""
        from zoedepth.utils.misc import pil_to_batched_tensor
        from PIL import Image
        pil = Image.fromarray(frame_rgb)
        X = pil_to_batched_tensor(pil).cuda()
        with torch.no_grad():
            depth = self.model.infer(X)
        return depth.squeeze().cpu().numpy()

    def get_object_depth(self, depth_map, bbox):
        """Robust depth estimate for a detection box.

        Uses the 25th percentile (lower quartile) of valid ROI depths rather
        than the mean/median: it resists background pixels bleeding into the
        corners of the bounding box.

        Args:
            depth_map: HxW depth in meters.
            bbox: (x1, y1, x2, y2) pixel coordinates.

        Returns:
            Depth in meters, or None if the ROI has no valid (> 0.1 m) pixels.
        """
        x1, y1, x2, y2 = bbox
        roi = depth_map[y1:y2, x1:x2]
        valid = roi[roi > 0.1]  # drop zero/invalid depth readings
        if valid.size == 0:
            return None
        return float(np.percentile(valid, 25))
07
3D Vision
STEREO VISION & DISPARITY
Stereo Geometry:
Baseline B = jarak antara dua kamera (meter)
Focal length f (pixels)
Disparity d = x_left - x_right (pixels)
Depth: Z = f * B / d
3D point: X = (x - cx) * Z / fx
Y = (y - cy) * Z / fy
Minimum measurable depth: Z_min = f*B/d_max
Maximum depth limited by noise in disparity
import cv2, numpy as np

class StereoDepth:
    """Stereo rectification + SGBM block matching -> disparity and 3D points.

    Constructor args:
        K_L, D_L / K_R, D_R: left/right intrinsics and distortion coefficients.
        R, T: rotation/translation of the right camera w.r.t. the left
              (output of stereo calibration).
        img_size: (width, height) in pixels.
    """
    def __init__(self, K_L, D_L, K_R, D_R, R, T, img_size):
        # Rectification: R1/R2 rotate each view so epipolar lines become
        # horizontal, P1/P2 are the new projection matrices, Q reprojects
        # disparity to 3D. alpha=0 crops the output to valid pixels only.
        self.R1,self.R2,self.P1,self.P2,self.Q,*_ = cv2.stereoRectify(K_L,D_L,K_R,D_R,img_size,R,T,alpha=0)
        # Precompute per-pixel undistort+rectify lookup maps for both cameras
        self.map1L,self.map2L = cv2.initUndistortRectifyMap(K_L,D_L,self.R1,self.P1,img_size,cv2.CV_32FC1)
        self.map1R,self.map2R = cv2.initUndistortRectifyMap(K_R,D_R,self.R2,self.P2,img_size,cv2.CV_32FC1)
        # Semi-Global Block Matching. P1/P2 smoothness penalties follow the
        # recommended 8*channels*blockSize^2 and 32*channels*blockSize^2.
        self.stereo = cv2.StereoSGBM.create( minDisparity=0, numDisparities=128, blockSize=5, P1=8*3*5**2, P2=32*3*5**2, disp12MaxDiff=1, uniquenessRatio=10, speckleWindowSize=100, speckleRange=32)

    def compute_depth(self, left, right):
        """Compute disparity and 3D points for one rectified stereo pair.

        Args:
            left, right: raw BGR frames from the left/right cameras.

        Returns:
            (disp, depth_3d): disparity in pixels (float32) and an HxWx3
            array of 3D points in the left camera frame (via Q).
        """
        rL = cv2.remap(left, self.map1L, self.map2L, cv2.INTER_LINEAR)
        rR = cv2.remap(right,self.map1R, self.map2R, cv2.INTER_LINEAR)
        gL = cv2.cvtColor(rL, cv2.COLOR_BGR2GRAY)
        gR = cv2.cvtColor(rR, cv2.COLOR_BGR2GRAY)
        # SGBM outputs fixed-point disparity scaled by 16 -> divide back
        disp = self.stereo.compute(gL,gR).astype(np.float32)/16
        depth_3d = cv2.reprojectImageTo3D(disp, self.Q)
        return disp, depth_3d
08
3D
POINT CLOUD PROCESSING
import open3d as o3d
import numpy as np

def depth_to_pointcloud(depth, K, color=None):
    """Back-project a depth map (meters) into an Open3D point cloud.

    Args:
        depth: HxW depth in meters.
        K: 3x3 camera intrinsic matrix.
        color: optional HxWx3 uint8 image for per-point colors.

    Returns:
        open3d.geometry.PointCloud of points with depth in (0.1, 10) m.
    """
    h, w = depth.shape
    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    # Pinhole back-projection: X = (u-cx)Z/fx, Y = (v-cy)Z/fy
    Z = depth
    X = (u - cx) * Z / fx
    Y = (v - cy) * Z / fy
    pts = np.stack([X, Y, Z], axis=-1).reshape(-1, 3)
    # BUG FIX: source was garbled "(Z.flatten()10)" (syntax error) —
    # keep points within a plausible sensing range of 0.1–10 meters.
    z = Z.flatten()
    valid = (z > 0.1) & (z < 10)
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(pts[valid])
    if color is not None:
        c = color.reshape(-1, 3) / 255.0  # Open3D expects floats in [0, 1]
        pcd.colors = o3d.utility.Vector3dVector(c[valid])
    return pcd

def process_pointcloud(pcd):
    """Clean a raw cloud and strip the dominant plane (floor).

    Pipeline: voxel downsample -> statistical outlier removal -> normal
    estimation -> RANSAC plane fit -> return everything NOT on the plane.
    """
    # Voxel downsampling (2 cm grid)
    ds = pcd.voxel_down_sample(voxel_size=0.02)
    # Remove statistical outliers (sensor noise / flying pixels)
    cl, _ = ds.remove_statistical_outlier(nb_neighbors=20, std_ratio=2.0)
    # Estimate normals (needed by many downstream grasp/registration steps)
    cl.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=30))
    # RANSAC plane segmentation — the largest plane is assumed to be the floor
    plane, inliers = cl.segment_plane(distance_threshold=0.01, ransac_n=3, num_iterations=1000)
    obstacles = cl.select_by_index(inliers, invert=True)
    return obstacles
09
Manipulation
GRASP POINT DETECTION
6-DOF Grasp Estimation:
1. Point cloud dari depth camera
2. Object segmentation (YOLO mask -> point subset)
3. PCA untuk principal axes
4. Approach vector = normal permukaan terbaik
5. Grasp center = bounding box center
Libraries: GraspNet-1Billion, Contact-GraspNet, AnyGrasp
import numpy as np

def estimate_grasp(points_3d):
    """Simple PCA-based grasp estimation (numpy-only, sklearn dependency removed).

    PCA is computed via SVD of the centered cloud: the rows of Vt are the
    principal axes (descending variance) and the singular values give the
    per-axis standard deviation as s / sqrt(N - 1) — identical to sklearn's
    PCA up to axis sign, which is inherently ambiguous.

    Args:
        points_3d: (N, 3) array-like of object points in meters, N >= 3.

    Returns:
        dict with "center" (mean point), "approach" (unit vector along the
        thinnest dimension; sign may need flipping toward the camera),
        "grip_width_m" (gripper opening incl. 10% margin), "axes" (3x3).

    Raises:
        ValueError: if the input is not an (N, 3) array with N >= 3.
    """
    pts = np.asarray(points_3d, dtype=np.float64)
    if pts.ndim != 2 or pts.shape[1] != 3 or pts.shape[0] < 3:
        raise ValueError("points_3d must be an (N, 3) array with N >= 3")
    center = pts.mean(axis=0)
    _, s, vt = np.linalg.svd(pts - center, full_matrices=False)
    axes = vt                                      # 3 principal axes
    widths = (s / np.sqrt(len(pts) - 1)) * 2       # ~2-sigma extent per axis
    approach = axes[np.argmin(widths)]             # approach along shortest dimension
    grip_width = min(widths[0], widths[1]) * 1.1   # 10% safety margin
    return {"center": center, "approach": approach,
            "grip_width_m": grip_width, "axes": axes}
10
Fiducials
ARUCO MARKERS
import cv2, cv2.aruco as aruco
import numpy as np

class ArucoDetector:
    """Detect 6x6 ArUco markers and estimate each marker's 6-DOF pose.

    Args:
        K, D: camera intrinsics and distortion coefficients.
        marker_size_m: physical marker edge length in meters.
    """
    def __init__(self, K, D, marker_size_m=0.1):
        self.K = K
        self.D = D
        self.sz = marker_size_m
        self.dict = aruco.getPredefinedDictionary(aruco.DICT_6X6_250)
        self.params = aruco.DetectorParameters()
        self.detector = aruco.ArucoDetector(self.dict, self.params)

    def detect(self, frame):
        """Return a list of {"id", "R", "t"} poses for every visible marker."""
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        corners, ids, _ = self.detector.detectMarkers(gray)
        poses = []
        if ids is None:
            return poses
        # Marker corners in its own plane; order matches detectMarkers output
        # (top-left, top-right, bottom-right, bottom-left).
        obj_pts = np.array([[0, 0, 0], [self.sz, 0, 0],
                            [self.sz, self.sz, 0], [0, self.sz, 0]], np.float32)
        for i, c in enumerate(corners):
            # BUG FIX: solvePnP returns (retval, rvec, tvec) — the original
            # unpacked "rvec, tvec, _", so rvec held the success flag and
            # Rodrigues crashed on it.
            ok, rvec, tvec = cv2.solvePnP(obj_pts, c.reshape(4, 2), self.K, self.D)
            if not ok:
                continue
            R, _ = cv2.Rodrigues(rvec)  # rotation vector -> 3x3 matrix
            poses.append({"id": int(ids[i][0]), "R": R, "t": tvec.flatten()})
        return poses
11
ROS2
ROS2 VISION PIPELINE
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import Image, CameraInfo
from vision_msgs.msg import Detection2DArray, Detection2D
from cv_bridge import CvBridge
from ultralytics import YOLO

class VisionNode(Node):
    """ROS2 node: subscribes to camera frames, publishes YOLOv8 detections."""
    def __init__(self):
        super().__init__('vision_node')
        self.bridge = CvBridge()
        self.model = YOLO('yolov8n.pt')
        self.det_pub = self.create_publisher(Detection2DArray, '/detections', 10)
        self.create_subscription(Image, '/camera/image_raw', self.img_cb, 10)

    def img_cb(self, msg):
        """Per-frame callback: detect objects and publish a Detection2DArray."""
        frame = self.bridge.imgmsg_to_cv2(msg, desired_encoding='bgr8')
        results = self.model.predict(frame, verbose=False)
        det_arr = Detection2DArray()
        det_arr.header = msg.header  # keep timestamp/frame_id of source image
        for r in results:
            for box in r.boxes:
                x1, y1, x2, y2 = (float(v) for v in box.xyxy[0])
                d = Detection2D()
                d.bbox.center.position.x = (x1 + x2) / 2
                d.bbox.center.position.y = (y1 + y2) / 2
                d.bbox.size_x = x2 - x1
                d.bbox.size_y = y2 - y1
                det_arr.detections.append(d)
        self.det_pub.publish(det_arr)

def main():
    # BUG FIX: original spun the node at import time with no shutdown path.
    rclpy.init()
    node = VisionNode()
    try:
        rclpy.spin(node)
    finally:
        node.destroy_node()
        rclpy.shutdown()

if __name__ == '__main__':
    main()
COMPUTER VISION ROBOT — DOC 03 / 07
Antonius - bluedragonsec.com
Antonius - bluedragonsec.com