DOC 2 — LLM & DEEP LEARNING ROBOT
// DOC 02 / 07

LLM & DEEP LEARNING

Integration of LLMs, speech recognition, TTS, vision language models, ReAct reasoning, RAG memory, and an embodied AI pipeline for intelligent robots — with complete electronics and source code.

Whisper STT · Llama/GPT · Ollama Local · ReAct · XTTS
01
Hardware

HARDWARE PLATFORMS

A hardware platform for a robot with LLM integration needs edge compute powerful enough for model inference. A comparison of popular platforms:

Platform              | CPU                      | RAM          | GPU/NPU                | Power  | Best For
Raspberry Pi 5        | Cortex-A76 4-core        | 8GB LPDDR4X  | None                   | 5–10W  | Lightweight inference
NVIDIA Jetson Orin NX | 8-core Arm Cortex-A78AE  | 16GB         | 1024-core Ampere + DLA | 10–25W | Local LLM + vision
Jetson AGX Orin       | 12-core Arm Cortex-A78AE | 64GB         | 2048-core Ampere       | 15–60W | Full LLM robot
Intel NUC i7          | x86 i7                   | 32GB DDR5    | Iris Xe                | 28W    | ROS2 + inference
Orange Pi 5 Plus      | Cortex-A76               | 16GB LPDDR5  | Mali-G610              | 5–15W  | Budget option
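
To match a model to a platform in the table, a rough rule is weights ≈ parameter count × bytes per weight, plus runtime overhead for the KV cache and activations. A minimal sketch (the 1.2× overhead factor is an assumption, not a measured value):

# Rough memory estimate for running a quantized LLM
# overhead=1.2 is an assumed factor for KV cache and runtime buffers
def model_mem_gb(params_billion, bits=4, overhead=1.2):
    return params_billion * bits / 8 * overhead

for name, p in [("llama3.2:3b", 3.2), ("mistral:7b", 7.2), ("llama3:70b", 70)]:
    print(f"{name}: ~{model_mem_gb(p):.1f} GB at 4-bit")
# llama3.2:3b ≈ 1.9 GB -> fits a Raspberry Pi 5 (8GB), CPU-only and slow
# mistral:7b  ≈ 4.3 GB -> comfortable on an Orin NX 16GB
# llama3:70b  ≈ 42 GB  -> AGX Orin 64GB class only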
🎙️
ReSpeaker Mic Array
4-mic USB microphone array for beamforming and far-field speech recognition. Linux drivers work out of the box.
🔊
USB Speaker
Compact USB speaker for TTS output. Alternatively, use an I2S amplifier (MAX98357A) for better audio.
📷
OAK-D Lite
Stereo depth camera + neural inference accelerator (Myriad X). Ideal for edge vision.
🔋
LiPo BMS
4S 18Ah LiPo + 30A BMS. Enough for the Jetson plus the drive system for 3–4 hours (see the estimate below).
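
The 3–4 hour figure checks out from the pack capacity: a 4S pack is nominally 14.8 V, so 18 Ah ≈ 266 Wh. A quick sanity check, assuming a combined Jetson + drive load of 60–80 W (an assumed load, not a measurement):

# Back-of-envelope runtime from pack capacity (4S nominal = 14.8 V)
capacity_wh = 14.8 * 18              # ≈ 266 Wh
for load_w in (60, 80):              # assumed total Jetson + drive load
    print(f"{load_w} W -> {capacity_wh / load_w:.1f} h")
# 60 W -> 4.4 h, 80 W -> 3.3 h — consistent with the 3–4 h estimate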

Hardware Connection Diagram

# Hardware connectivity diagram:
# ReSpeaker USB Mic  --USB--> Jetson Orin
# OAK-D Lite Camera  --USB3-> Jetson Orin
# RPLiDAR A3         --USB--> Jetson Orin
#                              |
#                         Jetson Orin (ROS2, LLM inference, Nav2)
#                              | UART/USB
#                         Arduino Mega (PID, encoder, sensors)
#                              | PWM/GPIO
#                         Motor Driver (VESC/L298N)
#                              | DC
#                         Drive Motors (BLDC/DC)
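
Below the Jetson, the diagram hands low-level control to the Arduino Mega over serial. A minimal sketch of that link from the Jetson side, assuming a hypothetical line protocol (`V <left> <right>` for velocity setpoints, `O <l> <r>` for odometry replies — match these to your Arduino firmware):

import serial  # pip install pyserial

# Port and baud rate are assumptions — check `ls /dev/ttyACM*` on the Jetson
mcu = serial.Serial('/dev/ttyACM0', 115200, timeout=0.1)

def send_velocity(left_mps, right_mps):
    """Send a wheel-velocity setpoint; the Arduino PID loop tracks it."""
    mcu.write(f"V {left_mps:.3f} {right_mps:.3f}\n".encode())

def read_odometry():
    """Parse a hypothetical 'O <left_ticks> <right_ticks>' reply."""
    line = mcu.readline().decode(errors='ignore').strip()
    if line.startswith('O '):
        _, l, r = line.split()
        return int(l), int(r)
    return None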

I2S Audio Output — Arduino + MAX98357A

// Arduino ESP32: I2S audio output for TTS playback
#include <driver/i2s.h>

const i2s_config_t i2s_cfg = {
  .mode = (i2s_mode_t)(I2S_MODE_MASTER|I2S_MODE_TX),
  .sample_rate = 22050,
  .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
  .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
  .communication_format = I2S_COMM_FORMAT_I2S_MSB,
  .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
  .dma_buf_count = 8, .dma_buf_len = 64
};
const i2s_pin_config_t pin_cfg = {
  .bck_io_num=26, .ws_io_num=25, .data_out_num=22, .data_in_num=-1};

void setup(){
  // Install the I2S driver and route it to the MAX98357A pins
  i2s_driver_install(I2S_NUM_0, &i2s_cfg, 0, NULL);
  i2s_set_pin(I2S_NUM_0, &pin_cfg);
}
void loop(){}

void playAudio(int16_t* samples, size_t count){
  size_t written;
  // count*2: i2s_write takes a byte count; samples are 16-bit
  i2s_write(I2S_NUM_0, samples, count*2, &written, portMAX_DELAY);}
02
Audio

MICROPHONE & SPEECH INPUT

ReSpeaker + WebRTC VAD

import pyaudio, webrtcvad, numpy as np

class VoiceActivityDetector:
    def __init__(self, aggressiveness=2, sample_rate=16000):
        self.vad = webrtcvad.Vad(aggressiveness)  # 0-3, higher = more aggressive
        self.sr = sample_rate
        self.frame_ms = 30  # webrtcvad accepts 10, 20, or 30 ms frames
        self.frame_sz = int(sample_rate * self.frame_ms / 1000 * 2)  # bytes (16-bit mono)
        self.p = pyaudio.PyAudio()

    def record_utterance(self, silence_ms=800, max_sec=30):
        stream = self.p.open(rate=self.sr, channels=1,
                             format=pyaudio.paInt16, input=True,
                             frames_per_buffer=self.frame_sz//2)
        frames=[]; silent_ms=0; speaking=False
        max_frames = int(max_sec * 1000 / self.frame_ms)  # hard cap on recording length

        print("Listening...")
        for _ in range(max_frames):
            data = stream.read(self.frame_sz//2)
            is_speech = self.vad.is_speech(data, self.sr)

            if is_speech:
                speaking=True; silent_ms=0; frames.append(data)
            elif speaking:
                frames.append(data); silent_ms+=self.frame_ms
                if silent_ms >= silence_ms: break

        stream.close()
        return np.frombuffer(b''.join(frames), dtype=np.int16)
03
Speech

WHISPER SPEECH RECOGNITION

OpenAI Whisper is a multilingual ASR (Automatic Speech Recognition) model that runs locally, available in small/medium/large sizes. Bahasa Indonesia is supported natively (language code 'id').

import whisper, numpy as np

class WhisperSTT:
    def __init__(self, model_size='small', device='cuda'):
        print(f"Loading Whisper {model_size}...")
        self.model = whisper.load_model(model_size, device=device)
        # small: 244M params, medium: 769M, large: 1550M

    def transcribe(self, audio_np, language='id'):
        """audio_np: int16 PCM at 16 kHz; normalized here to float32 [-1, 1]"""
        audio = audio_np.astype(np.float32) / 32768.0
        result = self.model.transcribe(
            audio,
            language=language,
            task='transcribe',
            beam_size=5,
            temperature=0,
            no_speech_threshold=0.6,
            condition_on_previous_text=False)
        return result['text'].strip()

    def transcribe_realtime(self, audio_np):
        """Lower-latency single-pass decode with language auto-detection"""
        audio = audio_np.astype(np.float32) / 32768.0
        # Whisper works on fixed 30 s windows: pad (or trim) to fit
        padded = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(padded).to(self.model.device)
        _, probs = self.model.detect_language(mel)
        lang = max(probs, key=probs.get)
        options = whisper.DecodingOptions(language=lang, fp16=True)
        result = whisper.decode(self.model, mel, options)
        return result.text
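
Glue for the two classes above — a hands-free voice-input loop (usage sketch; record_utterance() returns int16 PCM at 16 kHz, which is exactly what transcribe() expects):

# Usage sketch: VAD-gated recording feeding Whisper
vad = VoiceActivityDetector(aggressiveness=2, sample_rate=16000)
stt = WhisperSTT('small', device='cuda')

while True:
    audio = vad.record_utterance(silence_ms=800)
    if len(audio) == 0:          # timed out without speech
        continue
    text = stt.transcribe(audio, language='id')
    print("Heard:", text)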

TTS — Coqui XTTS / Edge-TTS

import edge_tts, asyncio, subprocess

async def speak(text, voice='id-ID-GadisNeural'):
    """Microsoft Edge TTS — Indonesian voices (online service)"""
    tts = edge_tts.Communicate(text, voice)
    await tts.save('/tmp/speech.mp3')
    subprocess.run(['mpg321','/tmp/speech.mp3'])

# Indonesian voices:
# id-ID-GadisNeural (female)
# id-ID-ArdiNeural  (male)

asyncio.run(speak("Robot siap melaksanakan perintah"))
04
TTS

TEXT-TO-SPEECH OFFLINE

from TTS.api import TTS  # Coqui TTS, fully offline

tts = TTS('tts_models/multilingual/multi-dataset/xtts_v2').to('cuda')

def synthesize(text, speaker_wav='speaker.wav', language='id'):
    tts.tts_to_file(
        text=text,
        speaker_wav=speaker_wav,  # voice cloning from a few seconds of reference audio
        language=language,
        file_path='/tmp/out.wav')

# Direct playback without a temp file:
import sounddevice as sd
import numpy as np

def speak_stream(text):
    # tts.tts() returns a float waveform; XTTS v2 outputs 24 kHz audio
    wav = tts.tts(text=text, speaker_wav='speaker.wav', language='id')
    sd.play(np.array(wav), samplerate=24000); sd.wait()
05
LLM

LLM TASK PLANNING

The LLM serves as a high-level task planner that converts natural-language commands into sequences of primitive robot actions. The robot exposes a set of tool functions for the LLM to call.

Pipeline: user speech → STT (Whisper) → natural-language command → LLM (GPT/Llama) + tool calling → robot action sequence → execute via ROS2 / serial → TTS feedback → speaker

from openai import OpenAI
import json

client = OpenAI()  # or point at a local Ollama endpoint (see section 07)

# Robot tool / primitive definitions
ROBOT_TOOLS = [
  {"type":"function","function":{"name":"move_forward","description":"Move the robot forward","parameters":{"type":"object","properties":{"distance_m":{"type":"number","description":"Distance in meters"},"speed":{"type":"number","default":0.3}},"required":["distance_m"]}}},
  {"type":"function","function":{"name":"turn","description":"Rotate the robot","parameters":{"type":"object","properties":{"angle_deg":{"type":"number","description":"Turn angle, positive = left"},"speed":{"type":"number","default":0.5}},"required":["angle_deg"]}}},
  {"type":"function","function":{"name":"navigate_to","description":"Navigate to coordinates","parameters":{"type":"object","properties":{"x":{"type":"number"},"y":{"type":"number"},"frame":{"type":"string","default":"map"}},"required":["x","y"]}}},
  {"type":"function","function":{"name":"pick_object","description":"Pick up an object","parameters":{"type":"object","properties":{"object_name":{"type":"string"}},"required":["object_name"]}}},
  {"type":"function","function":{"name":"speak","description":"Make the robot speak","parameters":{"type":"object","properties":{"text":{"type":"string"}},"required":["text"]}}}
]

SYSTEM_PROMPT = """
Kamu adalah robot assistant cerdas bernama ARIA.
Gunakan tools yang tersedia untuk menjalankan perintah pengguna.
Selalu confirm aksi sebelum mengeksekusi.
Bahasa Indonesia diutamakan.
"""

class RobotLLMAgent:
    def __init__(self, robot_interface):
        self.robot = robot_interface
        self.history = [{"role":"system","content":SYSTEM_PROMPT}]

    def execute_tool(self, name, args):
        if   name=='move_forward': self.robot.move(args['distance_m'])
        elif name=='turn':         self.robot.turn(args['angle_deg'])
        elif name=='navigate_to':  self.robot.navigate(args['x'],args['y'])
        elif name=='pick_object':  self.robot.pick(args['object_name'])
        elif name=='speak':        self.robot.speak(args['text'])
        return f"OK: {name} executed"

    def process_command(self, user_input):
        self.history.append({"role":"user","content":user_input})
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.history,
            tools=ROBOT_TOOLS,
            tool_choice="auto")

        msg = response.choices[0].message
        if msg.tool_calls:
            # Append the assistant turn, then one "tool" message per call,
            # as the chat-completions tool protocol expects
            self.history.append(msg)
            for call in msg.tool_calls:
                args = json.loads(call.function.arguments)
                res  = self.execute_tool(call.function.name, args)
                self.history.append({"role":"tool",
                                     "tool_call_id":call.id,
                                     "content":res})
        else:
            self.history.append({"role":"assistant","content":msg.content})
        return msg.content or "Actions executed"
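
A usage sketch with a stub robot interface, to see the loop end to end (StubRobot is hypothetical; a real interface would publish to ROS2 or write to serial as in section 01):

# Hypothetical stand-in for a real ROS2/serial robot interface
class StubRobot:
    def move(self, d):        print(f"[robot] forward {d} m")
    def turn(self, a):        print(f"[robot] turn {a} deg")
    def navigate(self, x, y): print(f"[robot] navigate to ({x}, {y})")
    def pick(self, name):     print(f"[robot] pick {name}")
    def speak(self, text):    print(f"[robot] say: {text}")

agent = RobotLLMAgent(StubRobot())
print(agent.process_command("Move forward two meters, then turn left 90 degrees"))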
06
LLM

ReAct REASONING

ReAct (Reason + Act) is a prompting strategy in which the LLM alternates between reasoning steps and actions taken in response to observations from the environment.

ReAct loop:
  Thought:     [reasoning about the situation]
  Action:      [chosen action and its parameters]
  Observation: [result of the action from the environment]
  ... (repeat until the task is complete)

Advantages over pure tool calling:
- the LLM can explain its reasoning
- handles ambiguity better
- natural chain of thought
def react_agent(task, robot, max_steps=10):
    """ReAct robot agent loop"""
    prompt = f"""
You are a robot AI. Solve the task with a Thought -> Action -> Observation loop.

Available actions:
- scan_environment() -> list obstacles and objects
- navigate_to(x, y) -> move to coordinates
- pick(object) -> pick up an object
- place(location) -> put the object down
- check_goal() -> check whether the goal has been reached

Task: {task}

Output format:
Thought: [reasoning]
Action: function_name(args)
When the task is complete, output DONE instead of an Action.
"""
    messages = [{"role":"user","content":prompt}]
    
    for step in range(max_steps):
        response = client.chat.completions.create(
            model="gpt-4o", messages=messages)
        output = response.choices[0].message.content
        print(f"Step {step}: {output}")
        
        if "Action:" in output:
            action_line = [l for l in output.split('\n') if l.startswith('Action:')][0]
            action_str = action_line.replace('Action:','').strip()
            obs = robot.execute(action_str)
            messages.append({"role":"assistant","content":output})
            messages.append({"role":"user","content":f"Observation: {obs}"})
        elif "DONE" in output:
            break
    return "Task complete"
07
LLM

LOCAL LLM WITH OLLAMA

Ollama runs LLMs (Llama 3, Mistral, Phi-3) fully offline on a Jetson or local server — important for privacy and low latency.

import ollama, json

# Install: curl -fsSL https://ollama.ai/install.sh | sh
# Pull a model: ollama pull llama3.2:3b

class LocalRobotLLM:
    def __init__(self, model='llama3.2:3b'):
        self.model = model
        self.history = []
        self.system = """You are a robot AI named ARIA. Your job is to help humans by navigating, fetching objects, and providing information. Respond in Indonesian (Bahasa Indonesia)."""

    def chat(self, user_msg):
        self.history.append({"role":"user","content":user_msg})
        response = ollama.chat(
            model=self.model,
            messages=[{"role":"system","content":self.system}] + self.history,
            options={"temperature":0.1,"num_ctx":4096})
        reply = response['message']['content']
        self.history.append({"role":"assistant","content":reply})
        return reply

    def plan_task(self, task, current_state):
        prompt = (f"Robot state: {current_state}\nTask: {task}\n"
                  "Produce a step-by-step plan in JSON format.")
        response = ollama.generate(
            model=self.model, prompt=prompt,
            format='json', options={"temperature":0})
        return json.loads(response['response'])

# Approximate model size/speed on Jetson Orin NX 16GB:
# phi3:mini (3.8B) → ~15 tok/s
# llama3.2:3b     → ~12 tok/s
# mistral:7b      → ~7  tok/s (needs quantization)
# llava:7b        → ~6  tok/s (multimodal, can see images)
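
Ollama also exposes an OpenAI-compatible endpoint at http://localhost:11434/v1, so the tool-calling agent from section 05 can run fully offline by repointing the client (the api_key string is required by the SDK but ignored by Ollama; use a model with tool support, e.g. llama3.1/3.2):

from openai import OpenAI

# Same chat.completions API as before, served by local Ollama
client = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')

response = client.chat.completions.create(
    model='llama3.2:3b',   # any pulled Ollama model
    messages=[{"role": "user", "content": "Robot status check"}])
print(response.choices[0].message.content)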
08
Embodied AI

EMBODIED AI PIPELINE

Embodied AI integrates perception (vision + audio), reasoning (LLM), and action (robot control) in a continuous loop.

Embodied AI loop:
  Perception:
    camera frame → VLM / YOLOv8 → object list, scene description
    microphone   → VAD → Whisper → text command
    LiDAR scan   → local map update
  Reasoning (LLM):
    scene_description + command + robot_state → action_plan
  Action:
    action_plan → ROS2 Nav2 / manipulator → execute + TTS feedback
import threading, cv2
from queue import Queue, Empty

class EmbodiedRobot:
    def __init__(self):
        self.cmd_q  = Queue()
        self.obs_q  = Queue()
        self.llm    = LocalRobotLLM('llava:7b')  # multimodal
        self.stt    = WhisperSTT('small')
        self.vad    = VoiceActivityDetector()
        self.camera = cv2.VideoCapture(0)

    def perception_thread(self):
        """Continuous perception loop"""
        while True:
            ret, frame = self.camera.read()
            if ret:
                self.obs_q.put({"type":"vision","data":frame})
            audio = self.vad.record_utterance()
            if len(audio) > 0:
                text = self.stt.transcribe(audio)
                self.cmd_q.put({"type":"speech","text":text})

    def reasoning_thread(self):
        """LLM reasoning + action planning"""
        while True:
            cmd = self.cmd_q.get()
            try:
                obs = self.obs_q.get_nowait()
            except Empty:
                obs = None
            # Summarize the observation instead of dumping a raw frame into the prompt
            state = "camera frame available" if obs else "no visual observation"
            plan = self.llm.plan_task(cmd['text'], state)
            self.execute_plan(plan)  # dispatch plan steps to ROS2 / serial

    def run(self):
        t1=threading.Thread(target=self.perception_thread, daemon=True)
        t2=threading.Thread(target=self.reasoning_thread, daemon=True)
        t1.start(); t2.start(); t1.join()
09
Memory

RAG ROBOT MEMORY

The robot can keep an episodic memory that is queried with RAG (Retrieval-Augmented Generation) — especially useful for robots working in a familiar environment.

from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
import datetime

class RobotMemory:
    """Episodic memory with vector search"""
    def __init__(self):
        self.embed = OllamaEmbeddings(model='nomic-embed-text')
        self.db = Chroma(persist_directory='robot_memory',
                          embedding_function=self.embed)

    def remember(self, event, metadata=None):
        doc = f"[{datetime.datetime.now().isoformat()}] {event}"
        meta = metadata or {}
        meta['timestamp'] = str(datetime.datetime.now())
        self.db.add_texts([doc], metadatas=[meta])

    def recall(self, query, k=5):
        results = self.db.similarity_search(query, k=k)
        return [r.page_content for r in results]

    def update_map_knowledge(self, area, description):
        self.remember(
            f"Area {area}: {description}",
            metadata={"type":"map","area":area})

# Example usage:
memory = RobotMemory()
memory.remember("Found a red box at coordinates (2.5, 1.0)")
memory.remember("The living-room door is often closed in the late afternoon")
memory.update_map_knowledge("kitchen", "Fridge and table, bright lighting")

hasil = memory.recall("where is the red box?")
print(hasil)  # → ["[timestamp] Found a red box at coordinates (2.5, 1.0)", ...]
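
The generation half of RAG: recalled memories are stuffed into the LLM prompt as context. A minimal sketch reusing LocalRobotLLM from section 07:

# RAG answer: retrieve relevant memories, then answer from them only
def answer_from_memory(memory, llm, question, k=3):
    context = "\n".join(memory.recall(question, k=k))
    prompt = (f"Robot memory:\n{context}\n\n"
              f"Question: {question}\n"
              "Answer using only the memory above.")
    return llm.chat(prompt)

print(answer_from_memory(memory, LocalRobotLLM('llama3.2:3b'),
                         "Where did you last see the red box?"))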
10
Vision LM

VISION LANGUAGE MODEL

Models such as LLaVA, InternVL, or Moondream let the robot describe and reason about the camera scene in natural language.

import ollama, cv2, base64, json

class VisionLM:
    def __init__(self, model='llava:7b'):
        self.model = model

    def describe_scene(self, frame):
        _, buf = cv2.imencode('.jpg', frame)
        img_b64 = base64.b64encode(buf).decode()
        res = ollama.generate(
            model=self.model,
            prompt="Describe this scene for a robot. List: objects, their relative positions, and anything relevant for navigation. Respond in Indonesian.",
            images=[img_b64])
        return res['response']

    def find_object(self, frame, object_name):
        _, buf = cv2.imencode('.jpg', frame)
        img_b64 = base64.b64encode(buf).decode()
        res = ollama.generate(
            model=self.model,
            prompt=f"Is there a {object_name} in the image? If so, give its position (left/center/right, near/far). Answer as JSON: {{\"found\": bool, \"position\": string}}",
            images=[img_b64],
            format='json')
        return json.loads(res['response'])

# Example: robot looks for a glass of water
vlm = VisionLM()
cap = cv2.VideoCapture(0)
ret, frame = cap.read()
result = vlm.find_object(frame, "glass of water")
print(result)  # e.g. {"found": true, "position": "right, near"}
Antonius - bluedragonsec.com