LLM &
DEEP LEARNING
Integrating LLMs, speech recognition, TTS, vision language models, ReAct reasoning, RAG memory, and an embodied AI pipeline for intelligent robots, with complete electronics and source code.
HARDWARE PLATFORMS
A hardware platform for an LLM-integrated robot needs enough edge compute for on-device model inference. A comparison of popular platforms:
| Platform | CPU | RAM | GPU/NPU | Power | Best For |
|---|---|---|---|---|---|
| Raspberry Pi 5 | Cortex-A76 4-core | 8GB LPDDR4X | None | 5–10W | Lightweight inference |
| NVIDIA Jetson Orin NX | 8-core Arm A78AE | 16GB | 1024-core Ampere + DLA | 10–25W | Local LLM + Vision |
| Jetson AGX Orin | 12-core Arm A78 | 64GB | 2048-core Ampere | 15–60W | Full LLM robot |
| Intel NUC i7 | x86 i7 | 32GB DDR5 | Iris Xe | 28W | ROS2 + inference |
| Orange Pi 5 Plus | Cortex-A76 | 16GB LPDDR5 | Mali-G610 | 5–15W | Budget option |
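As a rough illustration of how the platform constrains the model choice, the sketch below picks an Ollama model tag from detected RAM and CUDA availability. The helper name pick_llm_model, the psutil dependency, and the RAM thresholds are assumptions for illustration, not values taken from the table.

import psutil

def pick_llm_model():
    """Hypothetical heuristic: choose an Ollama model tag from available RAM/GPU."""
    ram_gb = psutil.virtual_memory().total / 1e9
    has_cuda = False
    try:
        import torch
        has_cuda = torch.cuda.is_available()   # True on Jetson/NUC with CUDA-enabled PyTorch
    except ImportError:
        pass
    if ram_gb >= 32 and has_cuda:
        return 'mistral:7b'        # Jetson AGX Orin / NUC class
    if ram_gb >= 12 and has_cuda:
        return 'llama3.2:3b'       # Jetson Orin NX class
    return 'phi3:mini'             # Raspberry Pi 5 / Orange Pi class

print(pick_llm_model())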
Hardware Connection Diagram
# Hardware connectivity diagram:
# ReSpeaker USB Mic --USB--> Jetson Orin
# OAK-D Lite Camera --USB3-> Jetson Orin
# RPLiDAR A3 --USB--> Jetson Orin
# |
# Jetson Orin (ROS2, LLM inference, Nav2)
# | UART/USB
# Arduino Mega (PID, encoder, sensors)
# | PWM/GPIO
# Motor Driver (VESC/L298N)
# | DC
# Drive Motors (BLDC/DC)
I2S Audio Output — Arduino + MAX98357A
// Arduino ESP32: I2S audio output for TTS playback
#include <driver/i2s.h>

const i2s_config_t i2s_cfg = {
  .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX),
  .sample_rate = 22050,
  .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
  .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
  .communication_format = I2S_COMM_FORMAT_I2S_MSB,
  .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
  .dma_buf_count = 8,
  .dma_buf_len = 64
};

const i2s_pin_config_t pin_cfg = {
  .bck_io_num = 26,    // BCLK -> MAX98357A BCLK
  .ws_io_num = 25,     // LRC  -> MAX98357A LRC
  .data_out_num = 22,  // DIN  -> MAX98357A DIN
  .data_in_num = -1    // output only
};

void setup() {
  i2s_driver_install(I2S_NUM_0, &i2s_cfg, 0, NULL);
  i2s_set_pin(I2S_NUM_0, &pin_cfg);
}

void playAudio(int16_t* samples, size_t count) {
  size_t written;
  i2s_write(I2S_NUM_0, samples, count * 2, &written, portMAX_DELAY);  // count*2 = bytes
}
MICROPHONE & SPEECH INPUT
ReSpeaker + WebRTC VAD
import pyaudio, webrtcvad, numpy as np

class VoiceActivityDetector:
    def __init__(self, aggressiveness=2, sample_rate=16000):
        self.vad = webrtcvad.Vad(aggressiveness)  # 0-3, higher = more aggressive filtering
        self.sr = sample_rate
        self.frame_ms = 30                        # 10, 20, or 30 ms
        self.frame_sz = int(sample_rate * self.frame_ms / 1000 * 2)  # bytes (16-bit mono)
        self.p = pyaudio.PyAudio()

    def record_utterance(self, silence_ms=800, max_sec=30):
        stream = self.p.open(rate=self.sr, channels=1, format=pyaudio.paInt16,
                             input=True, frames_per_buffer=self.frame_sz // 2)
        frames = []; silent_ms = 0; speaking = False
        total = 0
        print("Listening...")
        while total < int(max_sec * 1000 / self.frame_ms):   # hard cap at max_sec
            total += 1
            data = stream.read(self.frame_sz // 2)
            is_speech = self.vad.is_speech(data, self.sr)
            if is_speech:
                speaking = True; silent_ms = 0
                frames.append(data)
            elif speaking:
                frames.append(data)
                silent_ms += self.frame_ms
                if silent_ms >= silence_ms:
                    break
        stream.close()
        return np.frombuffer(b''.join(frames), dtype=np.int16)
WHISPER SPEECH RECOGNITION
OpenAI Whisper is a multilingual ASR (Automatic Speech Recognition) model that runs locally, available in small/medium/large sizes. Indonesian is supported natively.
import whisper, numpy as np

class WhisperSTT:
    def __init__(self, model_size='small', device='cuda'):
        print(f"Loading Whisper {model_size}...")
        self.model = whisper.load_model(model_size, device=device)
        # small: 244M params, medium: 769M, large: 1550M

    def transcribe(self, audio_np, language='id'):
        """audio_np: int16 PCM at 16 kHz; normalized here to float32 in [-1, 1]"""
        audio = audio_np.astype(np.float32) / 32768.0
        result = self.model.transcribe(
            audio, language=language, task='transcribe',
            beam_size=5, temperature=0,
            no_speech_threshold=0.6,
            condition_on_previous_text=False)
        return result['text'].strip()

    def transcribe_realtime(self, audio_np):
        """Single-pass transcription with automatic language detection"""
        audio = audio_np.astype(np.float32) / 32768.0
        padded = whisper.pad_or_trim(audio)       # pad/trim to 30 s
        mel = whisper.log_mel_spectrogram(padded).to(self.model.device)
        _, probs = self.model.detect_language(mel)
        lang = max(probs, key=probs.get)
        options = whisper.DecodingOptions(language=lang, fp16=True)
        result = whisper.decode(self.model, mel, options)
        return result.text
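A minimal usage sketch that chains the two classes above: VAD-gated recording feeds directly into Whisper. The loop structure and the stop phrase are illustrative assumptions.

# Sketch: VAD-gated recording piped into Whisper (classes defined above)
vad = VoiceActivityDetector(aggressiveness=2)
stt = WhisperSTT(model_size='small')

while True:
    audio = vad.record_utterance(silence_ms=800)
    if len(audio) == 0:
        continue                              # nothing was spoken within max_sec
    text = stt.transcribe(audio, language='id')
    print("Heard:", text)
    if "berhenti" in text.lower():            # illustrative stop phrase ("stop")
        break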
TTS — Coqui XTTS / Edge-TTS
import asyncio, subprocess
import edge_tts

async def speak(text, voice='id-ID-GadisNeural'):
    """Microsoft Edge TTS (Indonesian voices)"""
    tts = edge_tts.Communicate(text, voice)
    await tts.save('/tmp/speech.mp3')
    subprocess.run(['mpg321', '/tmp/speech.mp3'])

# Indonesian voices:
#   id-ID-GadisNeural (female)
#   id-ID-ArdiNeural  (male)
asyncio.run(speak("Robot siap melaksanakan perintah"))
OFFLINE TEXT-TO-SPEECH
from TTS.api import TTS        # Coqui TTS, fully offline
import numpy as np
import sounddevice as sd

tts = TTS('tts_models/multilingual/multi-dataset/xtts_v2').to('cuda')

def synthesize(text, speaker_wav='speaker.wav', language='id'):
    tts.tts_to_file(
        text=text,
        speaker_wav=speaker_wav,   # voice cloning from a few seconds of reference audio
        language=language,
        file_path='/tmp/out.wav')

# Direct playback without an intermediate file:
def speak_stream(text, speaker_wav='speaker.wav', language='id'):
    wav = tts.tts(text=text, speaker_wav=speaker_wav, language=language)  # list of floats in [-1, 1]
    sd.play(np.array(wav, dtype=np.float32), samplerate=24000)            # XTTS v2 outputs 24 kHz audio
    sd.wait()
LLM TASK PLANNING
The LLM acts as a high-level task planner that converts natural language commands into a sequence of primitive robot actions. The robot exposes a set of tool functions that the LLM can call.
from openai import OpenAI
import json

client = OpenAI()   # or point to a local Ollama endpoint

# Robot tool / primitive definitions
ROBOT_TOOLS = [
    {"type": "function", "function": {"name": "move_forward", "description": "Gerakkan robot maju",
        "parameters": {"type": "object", "properties": {
            "distance_m": {"type": "number", "description": "Jarak dalam meter"},
            "speed": {"type": "number", "default": 0.3}},
            "required": ["distance_m"]}}},
    {"type": "function", "function": {"name": "turn", "description": "Putar robot",
        "parameters": {"type": "object", "properties": {
            "angle_deg": {"type": "number", "description": "Sudut putar, positif=kiri"},
            "speed": {"type": "number", "default": 0.5}},
            "required": ["angle_deg"]}}},
    {"type": "function", "function": {"name": "navigate_to", "description": "Navigasi ke koordinat",
        "parameters": {"type": "object", "properties": {
            "x": {"type": "number"}, "y": {"type": "number"},
            "frame": {"type": "string", "default": "map"}},
            "required": ["x", "y"]}}},
    {"type": "function", "function": {"name": "pick_object", "description": "Ambil objek",
        "parameters": {"type": "object", "properties": {
            "object_name": {"type": "string"}},
            "required": ["object_name"]}}},
    {"type": "function", "function": {"name": "speak", "description": "Robot bicara",
        "parameters": {"type": "object", "properties": {
            "text": {"type": "string"}},
            "required": ["text"]}}}
]

SYSTEM_PROMPT = """
Kamu adalah robot assistant cerdas bernama ARIA.
Gunakan tools yang tersedia untuk menjalankan perintah pengguna.
Selalu confirm aksi sebelum mengeksekusi. Bahasa Indonesia diutamakan.
"""

class RobotLLMAgent:
    def __init__(self, robot_interface):
        self.robot = robot_interface
        self.history = [{"role": "system", "content": SYSTEM_PROMPT}]

    def execute_tool(self, name, args):
        if name == 'move_forward':    self.robot.move(args['distance_m'])
        elif name == 'turn':          self.robot.turn(args['angle_deg'])
        elif name == 'navigate_to':   self.robot.navigate(args['x'], args['y'])
        elif name == 'pick_object':   self.robot.pick(args['object_name'])
        elif name == 'speak':         self.robot.speak(args['text'])
        return f"OK: {name} executed"

    def process_command(self, user_input):
        self.history.append({"role": "user", "content": user_input})
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.history,
            tools=ROBOT_TOOLS,
            tool_choice="auto")
        msg = response.choices[0].message
        if msg.tool_calls:
            results = []
            for call in msg.tool_calls:
                args = json.loads(call.function.arguments)
                res = self.execute_tool(call.function.name, args)
                results.append(res)
            self.history.append({"role": "assistant", "content": f"Executed: {results}"})
        return msg.content or "Actions executed"
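RobotLLMAgent expects a robot_interface object exposing move/turn/navigate/pick/speak. A hypothetical MockRobot stub (not part of any library used here) lets the agent be bench-tested without hardware:

# Hypothetical stand-in robot interface for testing the agent without hardware
class MockRobot:
    def move(self, d):        print(f"[robot] forward {d} m")
    def turn(self, a):        print(f"[robot] turn {a} deg")
    def navigate(self, x, y): print(f"[robot] navigate to ({x}, {y})")
    def pick(self, name):     print(f"[robot] pick {name}")
    def speak(self, text):    print(f"[robot] say: {text}")

agent = RobotLLMAgent(MockRobot())
print(agent.process_command("Maju 1 meter lalu bilang halo"))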
ReAct REASONING
ReAct (Reason + Act) is a prompting strategy in which the LLM alternates between reasoning and taking actions based on observations of the environment.
def react_agent(task, robot, max_steps=10):
    """ReAct robot agent loop"""
    prompt = f"""
Kamu adalah robot AI. Selesaikan tugas dengan loop Thought->Action->Observation.
Aksi tersedia:
- scan_environment() -> list obstacles dan objek
- navigate_to(x, y) -> bergerak ke koordinat
- pick(object) -> ambil objek
- place(location) -> taruh objek
- check_goal() -> cek apakah goal tercapai

Task: {task}

Format output:
Thought: [reasoning]
Action: function_name(args)
"""
    messages = [{"role": "user", "content": prompt}]
    for step in range(max_steps):
        response = client.chat.completions.create(
            model="gpt-4o", messages=messages)
        output = response.choices[0].message.content
        print(f"Step {step}: {output}")
        if "Action:" in output:
            action_line = [l for l in output.split('\n') if l.startswith('Action:')][0]
            action_str = action_line.replace('Action:', '').strip()
            obs = robot.execute(action_str)
            messages.append({"role": "assistant", "content": output})
            messages.append({"role": "user", "content": f"Observation: {obs}"})
        elif "DONE" in output:
            break
    return "Task complete"
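The loop above calls robot.execute(action_str) and feeds the result back as an Observation. A hypothetical stub that returns canned observations, useful for testing the prompt format before connecting real hardware:

# Hypothetical executor: maps the Action string produced by the LLM to simulated observations
class ReActRobotStub:
    def execute(self, action_str):
        if action_str.startswith("scan_environment"):
            return "obstacles: none; objects: red box at (2.5, 1.0)"
        if action_str.startswith("navigate_to"):
            return "arrived at target pose"
        if action_str.startswith("pick"):
            return "object grasped"
        if action_str.startswith("place"):
            return "object placed"
        if action_str.startswith("check_goal"):
            return "goal reached, reply DONE"
        return "unknown action"

react_agent("Ambil kotak merah dan taruh di meja", ReActRobotStub())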
LOCAL LLM WITH OLLAMA
Ollama makes it possible to run LLMs (Llama 3, Mistral, Phi-3) offline on a Jetson or a local server, which matters for privacy and low latency.
import ollama, json
# Install: curl -fsSL https://ollama.ai/install.sh | sh
# Pull model: ollama pull llama3.2:3b

class LocalRobotLLM:
    def __init__(self, model='llama3.2:3b'):
        self.model = model
        self.history = []
        self.system = """Kamu adalah robot AI bernama ARIA.
Tugasmu membantu manusia dengan melakukan navigasi, mengambil objek,
dan memberikan informasi. Gunakan Bahasa Indonesia."""

    def chat(self, user_msg):
        self.history.append({"role": "user", "content": user_msg})
        response = ollama.chat(
            model=self.model,
            messages=[{"role": "system", "content": self.system}] + self.history,
            options={"temperature": 0.1, "num_ctx": 4096})
        reply = response['message']['content']
        self.history.append({"role": "assistant", "content": reply})
        return reply

    def plan_task(self, task, current_state):
        prompt = (f"Situasi robot: {current_state}\nTugas: {task}\n"
                  "Buat rencana langkah demi langkah dalam format JSON.")
        response = ollama.generate(
            model=self.model, prompt=prompt, format='json',
            options={"temperature": 0})
        return json.loads(response['response'])

# Approximate model size/speed on a Jetson Orin NX 16GB:
#   phi3:mini (3.8B) → ~15 tok/s
#   llama3.2:3b      → ~12 tok/s
#   mistral:7b       → ~7 tok/s  (needs quantization)
#   llava:7b         → ~6 tok/s  (multimodal, can take images)
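A short usage sketch for LocalRobotLLM; the state dictionary passed to plan_task is illustrative, and the structure of the returned JSON plan depends entirely on the model's output.

# Usage sketch (state dictionary is illustrative)
llm = LocalRobotLLM('llama3.2:3b')
print(llm.chat("Halo, siapa kamu?"))

state = {"position": [1.2, 0.5], "battery": "78%", "gripper": "empty"}
plan = llm.plan_task("Ambil gelas di dapur", state)
print(plan)   # JSON plan, schema depends on the model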
EMBODIED AI PIPELINE
Embodied AI integrates perception (vision + audio), reasoning (LLM), and action (robot control) in a continuous loop.
import threading
import cv2
from queue import Queue, Empty

class EmbodiedRobot:
    def __init__(self):
        self.cmd_q = Queue()
        self.obs_q = Queue()
        self.llm = LocalRobotLLM('llava:7b')   # multimodal
        self.stt = WhisperSTT('small')
        self.vad = VoiceActivityDetector()
        self.camera = cv2.VideoCapture(0)

    def perception_thread(self):
        """Continuous perception loop: camera frames + voice input"""
        while True:
            ret, frame = self.camera.read()
            if ret:
                self.obs_q.put({"type": "vision", "data": frame})
            audio = self.vad.record_utterance()
            if audio is not None and len(audio) > 0:
                text = self.stt.transcribe(audio)
                self.cmd_q.put({"type": "speech", "text": text})

    def reasoning_thread(self):
        """LLM reasoning + action planning"""
        while True:
            cmd = self.cmd_q.get()
            obs = None
            try:
                obs = self.obs_q.get_nowait()
            except Empty:
                pass
            plan = self.llm.plan_task(cmd['text'], obs)
            self.execute_plan(plan)   # dispatch plan steps to the robot controller

    def run(self):
        t1 = threading.Thread(target=self.perception_thread, daemon=True)
        t2 = threading.Thread(target=self.reasoning_thread, daemon=True)
        t1.start(); t2.start()
        t1.join()
RAG ROBOT MEMORY
The robot can keep an episodic memory that is queried with RAG (Retrieval Augmented Generation). This is especially useful for robots working in a familiar environment.
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
import datetime

class RobotMemory:
    """Episodic memory with vector search"""
    def __init__(self):
        self.embed = OllamaEmbeddings(model='nomic-embed-text')
        self.db = Chroma(persist_directory='robot_memory',
                         embedding_function=self.embed)

    def remember(self, event, metadata=None):
        doc = f"[{datetime.datetime.now().isoformat()}] {event}"
        meta = metadata or {}
        meta['timestamp'] = str(datetime.datetime.now())
        self.db.add_texts([doc], metadatas=[meta])

    def recall(self, query, k=5):
        results = self.db.similarity_search(query, k=k)
        return [r.page_content for r in results]

    def update_map_knowledge(self, area, description):
        self.remember(
            f"Area {area}: {description}",
            metadata={"type": "map", "area": area})

# Example usage:
memory = RobotMemory()
memory.remember("Ditemukan kotak merah di koordinat (2.5, 1.0)")
memory.remember("Pintu ruang tamu sering tertutup sore hari")
memory.update_map_knowledge("dapur", "Ada kulkas dan meja, pencahayaan terang")
hasil = memory.recall("dimana kotak merah?")
print(hasil)  # → "Ditemukan kotak merah di koordinat (2.5, 1.0)"
VISION LANGUAGE MODEL
Models such as LLaVA, InternVL, or Moondream let the robot understand the camera scene in natural language.
import ollama, cv2, base64, json

class VisionLM:
    def __init__(self, model='llava:7b'):
        self.model = model

    def describe_scene(self, frame):
        _, buf = cv2.imencode('.jpg', frame)
        img_b64 = base64.b64encode(buf).decode()
        res = ollama.generate(
            model=self.model,
            prompt="Deskripsikan scene ini untuk robot. Sebutkan: objek, posisi relatif, "
                   "hal yang perlu diperhatikan untuk navigasi. Bahasa Indonesia.",
            images=[img_b64])
        return res['response']

    def find_object(self, frame, object_name):
        _, buf = cv2.imencode('.jpg', frame)
        img_b64 = base64.b64encode(buf).decode()
        res = ollama.generate(
            model=self.model,
            prompt=f"Apakah ada {object_name} dalam gambar? Jika ada, sebutkan posisinya "
                   f"(kiri/tengah/kanan, dekat/jauh). Jawab JSON: {{\"found\": bool, \"position\": string}}",
            images=[img_b64],
            format='json')
        return json.loads(res['response'])

# Example: the robot looks for a water glass
vlm = VisionLM()
cap = cv2.VideoCapture(0)
ret, frame = cap.read()
result = vlm.find_object(frame, "gelas air")
print(result)   # e.g. {"found": true, "position": "kanan, dekat"}
Antonius - bluedragonsec.com